# Read in the file
# Load the tab-delimited bike sharing file (first line is the header) into a
# data frame.
bike_sharing_data <- read.delim(file = "bike_sharing_data.txt")
# Show each column's name, parsed type and first few values; useful for
# spotting columns that were not read with the expected type.
str(object = bike_sharing_data)
## 'data.frame': 17379 obs. of 13 variables:
## $ datetime : chr "1/1/2011 0:00" "1/1/2011 1:00" "1/1/2011 2:00" "1/1/2011 3:00" ...
## $ season : int 1 1 1 1 1 1 1 1 1 1 ...
## $ holiday : int 0 0 0 0 0 0 0 0 0 0 ...
## $ workingday: int 0 0 0 0 0 0 0 0 0 0 ...
## $ weather : int 1 1 1 1 1 2 1 1 1 1 ...
## $ temp : num 9.84 9.02 9.02 9.84 9.84 ...
## $ atemp : num 14.4 13.6 13.6 14.4 14.4 ...
## $ humidity : chr "81" "80" "80" "75" ...
## $ windspeed : num 0 0 0 0 0 ...
## $ casual : int 3 8 5 3 0 0 2 1 1 8 ...
## $ registered: int 13 32 27 10 1 1 0 2 7 6 ...
## $ count : int 16 40 32 13 1 1 2 3 8 14 ...
## $ sources : chr "ad campaign" "www.yahoo.com" "www.google.fi" "AD campaign" ...
# Summarize the data
# Per-column overview: quartiles and mean for numeric columns, length/class/mode
# for character columns.
summary(object = bike_sharing_data)
## datetime season holiday workingday
## Length:17379 Min. :1.000 Min. :0.00000 Min. :0.0000
## Class :character 1st Qu.:2.000 1st Qu.:0.00000 1st Qu.:0.0000
## Mode :character Median :3.000 Median :0.00000 Median :1.0000
## Mean :2.502 Mean :0.02877 Mean :0.6827
## 3rd Qu.:3.000 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :4.000 Max. :1.00000 Max. :1.0000
## weather temp atemp humidity
## Min. :1.000 Min. : 0.82 Min. : 0.00 Length:17379
## 1st Qu.:1.000 1st Qu.:13.94 1st Qu.:16.66 Class :character
## Median :1.000 Median :20.50 Median :24.24 Mode :character
## Mean :1.425 Mean :20.38 Mean :23.79
## 3rd Qu.:2.000 3rd Qu.:27.06 3rd Qu.:31.06
## Max. :4.000 Max. :41.00 Max. :50.00
## windspeed casual registered count
## Min. : 0.000 Min. : 0.00 Min. : 0.0 Min. : 1
## 1st Qu.: 7.002 1st Qu.: 4.00 1st Qu.: 36.0 1st Qu.: 42
## Median :12.998 Median : 16.00 Median :116.0 Median :141
## Mean :12.737 Mean : 34.48 Mean :152.5 Mean :187
## 3rd Qu.:16.998 3rd Qu.: 46.00 3rd Qu.:217.0 3rd Qu.:277
## Max. :56.997 Max. :367.00 Max. :886.0 Max. :977
## sources
## Length:17379
## Class :character
## Mode :character
##
##
##
# Preview the data
# First six rows (six is head()'s default, spelled out here).
head(bike_sharing_data, n = 6L)
## datetime season holiday workingday weather temp atemp humidity
## 1 1/1/2011 0:00 1 0 0 1 9.84 14.395 81
## 2 1/1/2011 1:00 1 0 0 1 9.02 13.635 80
## 3 1/1/2011 2:00 1 0 0 1 9.02 13.635 80
## 4 1/1/2011 3:00 1 0 0 1 9.84 14.395 75
## 5 1/1/2011 4:00 1 0 0 1 9.84 14.395 75
## 6 1/1/2011 5:00 1 0 0 2 9.84 12.880 75
## windspeed casual registered count sources
## 1 0.0000 3 13 16 ad campaign
## 2 0.0000 8 32 40 www.yahoo.com
## 3 0.0000 5 27 32 www.google.fi
## 4 0.0000 3 10 13 AD campaign
## 5 0.0000 0 1 1 Twitter
## 6 6.0032 0 1 1 www.bing.com
# Last six rows (six is tail()'s default, spelled out here).
tail(bike_sharing_data, n = 6L)
## datetime season holiday workingday weather temp atemp humidity
## 17374 12/31/2012 18:00 1 0 1 2 10.66 13.635 48
## 17375 12/31/2012 19:00 1 0 1 2 10.66 12.880 60
## 17376 12/31/2012 20:00 1 0 1 2 10.66 12.880 60
## 17377 12/31/2012 21:00 1 0 1 1 10.66 12.880 60
## 17378 12/31/2012 22:00 1 0 1 1 10.66 13.635 56
## 17379 12/31/2012 23:00 1 0 1 1 10.66 13.635 65
## windspeed casual registered count sources
## 17374 8.9981 10 326 336 facebook page
## 17375 11.0014 5 206 211 <NA>
## 17376 11.0014 4 140 144 AD campaign
## 17377 11.0014 3 96 99 AD campaign
## 17378 8.9981 4 90 94 ad campaign
## 17379 8.9981 3 50 53 direct
# Create a contingency table
# Frequency of each traffic source, most common first.
# table() omits NA by default, so rows with a missing source are not counted.
# Labels are counted exactly as stored, so differently-cased spellings of the
# same source show up as separate categories.
sort(table(bike_sharing_data[["sources"]]), decreasing = TRUE)
##
## ad campaign www.yahoo.com direct www.bing.com
## 3472 1705 1610 1595
## www.google.co.uk facebook page AD campaign Twitter
## 1553 1551 894 890
## Twitter Ad Campaign www.google.fi www.google.com
## 855 851 828 527
## blog
## 494
# Indexing: select rows and cells by position
# Row 8, all columns (an empty column index keeps every column).
bike_sharing_data[8L, ]
## datetime season holiday workingday weather temp atemp humidity windspeed
## 8 1/1/2011 7:00 1 0 0 1 8.2 12.88 86 0
## casual registered count sources
## 8 1 2 3 www.yahoo.com
# Single cell: row 8, column 9 (column 9 is windspeed in the str() listing).
bike_sharing_data[8L, 9L]
## [1] 0
# Subsetting: filter rows by condition
# Keep only the rows recorded in season 1. which() turns the logical mask into
# row positions and drops any NA comparisons, matching what subset() does.
cold_season_data <- bike_sharing_data[which(bike_sharing_data$season == 1), ]
head(cold_season_data, n = 6L)
## datetime season holiday workingday weather temp atemp humidity
## 1 1/1/2011 0:00 1 0 0 1 9.84 14.395 81
## 2 1/1/2011 1:00 1 0 0 1 9.02 13.635 80
## 3 1/1/2011 2:00 1 0 0 1 9.02 13.635 80
## 4 1/1/2011 3:00 1 0 0 1 9.84 14.395 75
## 5 1/1/2011 4:00 1 0 0 1 9.84 14.395 75
## 6 1/1/2011 5:00 1 0 0 2 9.84 12.880 75
## windspeed casual registered count sources
## 1 0.0000 3 13 16 ad campaign
## 2 0.0000 8 32 40 www.yahoo.com
## 3 0.0000 5 27 32 www.google.fi
## 4 0.0000 3 10 13 AD campaign
## 5 0.0000 0 1 1 Twitter
## 6 6.0032 0 1 1 www.bing.com
# Rows where the weather code is 1 AND more than 500 rides were counted.
# `&` is the element-wise AND; which() drops NA results just as subset() would.
good_weather_high_count <- bike_sharing_data[
  which(bike_sharing_data$weather == 1 & bike_sharing_data$count > 500),
]
head(good_weather_high_count, n = 6L)
## datetime season holiday workingday weather temp atemp humidity
## 2660 4/25/2011 17:00 2 0 1 1 28.70 32.575 54
## 2684 4/26/2011 17:00 2 0 1 1 27.88 31.820 61
## 2732 4/28/2011 17:00 2 0 1 1 26.24 31.060 47
## 2828 5/2/2011 17:00 2 0 1 1 27.06 31.060 65
## 2852 5/3/2011 17:00 2 0 1 1 28.70 32.575 54
## 2853 5/3/2011 18:00 2 0 1 1 28.70 32.575 48
## windspeed casual registered count sources
## 2660 19.9995 66 464 530 www.google.co.uk
## 2684 23.9994 69 477 546 www.yahoo.com
## 2732 23.9994 57 444 501 www.google.fi
## 2828 12.9980 65 472 537 www.yahoo.com
## 2852 31.0009 53 464 517 ad campaign
## 2853 27.9993 59 485 544 Twitter
# Rows with humidity above 90.
#
# humidity was parsed as a character column (see the str() output), so a direct
# `humidity > 90` would coerce 90 to "90" and compare strings: "100" would be
# excluded ("1" sorts before "9") and non-numeric strings could be included.
# Convert to numeric first so the comparison is a real numeric one.
# Values that cannot be parsed as numbers become NA; the coercion warning is
# suppressed for this one call because those NAs are handled explicitly below.
humidity_numeric <- suppressWarnings(as.numeric(bike_sharing_data$humidity))
# which() keeps only rows where the test is TRUE, so NA humidity values do not
# produce all-NA rows in the result.
high_humidity_data <- bike_sharing_data[which(humidity_numeric > 90), ]
head(high_humidity_data)
## datetime season holiday workingday weather temp atemp humidity
## 23 1/1/2011 22:00 1 0 0 2 16.40 20.455 94
## 26 1/2/2011 1:00 1 0 0 2 18.04 21.970 94
## 28 1/2/2011 3:00 1 0 0 2 18.86 22.725 94
## 29 1/2/2011 4:00 1 0 0 2 18.86 22.725 94
## 170 1/8/2011 8:00 1 0 0 3 6.56 9.090 93
## 171 1/8/2011 9:00 1 0 0 3 6.56 9.090 93
## windspeed casual registered count sources
## 23 15.0013 11 17 28 Ad Campaign
## 26 16.9979 1 16 17 Twitter
## 28 12.9980 2 4 6 www.google.co.uk
## 29 12.9980 2 1 3 ad campaign
## 170 7.0015 0 15 15 www.google.com
## 171 7.0015 0 20 20 Twitter