Import data from CSV file
# Attach readr, then load the bike-sharing CSV into `bike_csv`.
library(readr)
bike_csv <- read_csv(file = "bike_sharing_data.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 17379 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): datetime, sources
## dbl (11): season, holiday, workingday, weather, temp, atemp, humidity, winds...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# List the parsing issues that read_csv() warned about while loading the file.
problems(bike_csv)
## # A tibble: 1 × 5
## row col expected actual file
## <int> <int> <chr> <chr> <chr>
## 1 14178 8 a double x61 ""
Import data from TXT file
# Load the TXT version of the data with base R's read.delim().
bike_txt <- read.delim(file = "bike_sharing_data.txt")
Data structure and summary
# Inspect the column types and leading values of the CSV import.
str(bike_csv)
## spc_tbl_ [17,379 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ datetime : chr [1:17379] "1/1/2011 0:00" "1/1/2011 1:00" "1/1/2011 2:00" "1/1/2011 3:00" ...
## $ season : num [1:17379] 1 1 1 1 1 1 1 1 1 1 ...
## $ holiday : num [1:17379] 0 0 0 0 0 0 0 0 0 0 ...
## $ workingday: num [1:17379] 0 0 0 0 0 0 0 0 0 0 ...
## $ weather : num [1:17379] 1 1 1 1 1 2 1 1 1 1 ...
## $ temp : num [1:17379] 9.84 9.02 9.02 9.84 9.84 ...
## $ atemp : num [1:17379] 14.4 13.6 13.6 14.4 14.4 ...
## $ humidity : num [1:17379] 81 80 80 75 75 75 80 86 75 76 ...
## $ windspeed : num [1:17379] 0 0 0 0 0 ...
## $ casual : num [1:17379] 3 8 5 3 0 0 2 1 1 8 ...
## $ registered: num [1:17379] 13 32 27 10 1 1 0 2 7 6 ...
## $ count : num [1:17379] 16 40 32 13 1 1 2 3 8 14 ...
## $ sources : chr [1:17379] "ad campaign" "www.yahoo.com" "www.google.fi" "AD campaign" ...
## - attr(*, "spec")=
## .. cols(
## .. datetime = col_character(),
## .. season = col_double(),
## .. holiday = col_double(),
## .. workingday = col_double(),
## .. weather = col_double(),
## .. temp = col_double(),
## .. atemp = col_double(),
## .. humidity = col_double(),
## .. windspeed = col_double(),
## .. casual = col_double(),
## .. registered = col_double(),
## .. count = col_double(),
## .. sources = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# Per-column summary of the CSV import; the value that failed to parse in
# humidity shows up as a single NA.
summary(bike_csv)
## datetime season holiday workingday
## Length:17379 Min. :1.000 Min. :0.00000 Min. :0.0000
## Class :character 1st Qu.:2.000 1st Qu.:0.00000 1st Qu.:0.0000
## Mode :character Median :3.000 Median :0.00000 Median :1.0000
## Mean :2.502 Mean :0.02877 Mean :0.6827
## 3rd Qu.:3.000 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :4.000 Max. :1.00000 Max. :1.0000
##
## weather temp atemp humidity
## Min. :1.000 Min. : 0.82 Min. : 0.00 Min. : 0.00
## 1st Qu.:1.000 1st Qu.:13.94 1st Qu.:16.66 1st Qu.: 48.00
## Median :1.000 Median :20.50 Median :24.24 Median : 63.00
## Mean :1.425 Mean :20.38 Mean :23.79 Mean : 62.72
## 3rd Qu.:2.000 3rd Qu.:27.06 3rd Qu.:31.06 3rd Qu.: 78.00
## Max. :4.000 Max. :41.00 Max. :50.00 Max. :100.00
## NA's :1
## windspeed casual registered count
## Min. : 0.000 Min. : 0.00 Min. : 0.0 Min. : 1
## 1st Qu.: 7.002 1st Qu.: 4.00 1st Qu.: 36.0 1st Qu.: 42
## Median :12.998 Median : 16.00 Median :116.0 Median :141
## Mean :12.737 Mean : 34.48 Mean :152.5 Mean :187
## 3rd Qu.:16.998 3rd Qu.: 46.00 3rd Qu.:217.0 3rd Qu.:277
## Max. :56.997 Max. :367.00 Max. :886.0 Max. :977
##
## sources
## Length:17379
## Class :character
## Mode :character
##
##
##
##
# Inspect the column types of the TXT import (humidity is read as character
# here rather than numeric).
str(bike_txt)
## 'data.frame': 17379 obs. of 13 variables:
## $ datetime : chr "1/1/2011 0:00" "1/1/2011 1:00" "1/1/2011 2:00" "1/1/2011 3:00" ...
## $ season : int 1 1 1 1 1 1 1 1 1 1 ...
## $ holiday : int 0 0 0 0 0 0 0 0 0 0 ...
## $ workingday: int 0 0 0 0 0 0 0 0 0 0 ...
## $ weather : int 1 1 1 1 1 2 1 1 1 1 ...
## $ temp : num 9.84 9.02 9.02 9.84 9.84 ...
## $ atemp : num 14.4 13.6 13.6 14.4 14.4 ...
## $ humidity : chr "81" "80" "80" "75" ...
## $ windspeed : num 0 0 0 0 0 ...
## $ casual : int 3 8 5 3 0 0 2 1 1 8 ...
## $ registered: int 13 32 27 10 1 1 0 2 7 6 ...
## $ count : int 16 40 32 13 1 1 2 3 8 14 ...
## $ sources : chr "ad campaign" "www.yahoo.com" "www.google.fi" "AD campaign" ...
# Per-column summary of the TXT import.
summary(bike_txt)
## datetime season holiday workingday
## Length:17379 Min. :1.000 Min. :0.00000 Min. :0.0000
## Class :character 1st Qu.:2.000 1st Qu.:0.00000 1st Qu.:0.0000
## Mode :character Median :3.000 Median :0.00000 Median :1.0000
## Mean :2.502 Mean :0.02877 Mean :0.6827
## 3rd Qu.:3.000 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :4.000 Max. :1.00000 Max. :1.0000
## weather temp atemp humidity
## Min. :1.000 Min. : 0.82 Min. : 0.00 Length:17379
## 1st Qu.:1.000 1st Qu.:13.94 1st Qu.:16.66 Class :character
## Median :1.000 Median :20.50 Median :24.24 Mode :character
## Mean :1.425 Mean :20.38 Mean :23.79
## 3rd Qu.:2.000 3rd Qu.:27.06 3rd Qu.:31.06
## Max. :4.000 Max. :41.00 Max. :50.00
## windspeed casual registered count
## Min. : 0.000 Min. : 0.00 Min. : 0.0 Min. : 1
## 1st Qu.: 7.002 1st Qu.: 4.00 1st Qu.: 36.0 1st Qu.: 42
## Median :12.998 Median : 16.00 Median :116.0 Median :141
## Mean :12.737 Mean : 34.48 Mean :152.5 Mean :187
## 3rd Qu.:16.998 3rd Qu.: 46.00 3rd Qu.:217.0 3rd Qu.:277
## Max. :56.997 Max. :367.00 Max. :886.0 Max. :977
## sources
## Length:17379
## Class :character
## Mode :character
##
##
##
Count the number of records based on a variable’s values
# Tabulate records per season and list the seasons from most to least frequent.
# `decreasing` must be spelled exactly: a misspelled argument name is silently
# ignored by sort() on a table, which leaves the counts in ascending order.
sort(table(bike_csv$season), decreasing = TRUE)
##
## 3 2 1 4
## 4496 4409 4242 4232
Subset the data
# Pick a single cell by position: row 8, column 9.
bike_csv[8, 9]
## # A tibble: 1 × 1
## windspeed
## <dbl>
## 1 0
Based on criteria
# Keep only the rows whose season equals 1.
subset(x = bike_csv, subset = season == 1)
## # A tibble: 4,242 × 13
## datetime season holiday workingday weather temp atemp humidity windspeed
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1/1/2011 0:… 1 0 0 1 9.84 14.4 81 0
## 2 1/1/2011 1:… 1 0 0 1 9.02 13.6 80 0
## 3 1/1/2011 2:… 1 0 0 1 9.02 13.6 80 0
## 4 1/1/2011 3:… 1 0 0 1 9.84 14.4 75 0
## 5 1/1/2011 4:… 1 0 0 1 9.84 14.4 75 0
## 6 1/1/2011 5:… 1 0 0 2 9.84 12.9 75 6.00
## 7 1/1/2011 6:… 1 0 0 1 9.02 13.6 80 0
## 8 1/1/2011 7:… 1 0 0 1 8.2 12.9 86 0
## 9 1/1/2011 8:… 1 0 0 1 9.84 14.4 75 0
## 10 1/1/2011 9:… 1 0 0 1 13.1 17.4 76 0
## # ℹ 4,232 more rows
## # ℹ 4 more variables: casual <dbl>, registered <dbl>, count <dbl>,
## # sources <chr>
Based on multiple criteria
# Keep rows that satisfy both conditions: temp above 0.5 and workingday equal to 1.
subset(x = bike_csv, subset = temp > 0.5 & workingday == 1)
## # A tibble: 11,865 × 13
## datetime season holiday workingday weather temp atemp humidity windspeed
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1/3/2011 0:… 1 0 1 1 9.02 9.85 44 24.0
## 2 1/3/2011 1:… 1 0 1 1 8.2 8.34 44 28.0
## 3 1/3/2011 4:… 1 0 1 1 6.56 6.82 47 26.0
## 4 1/3/2011 5:… 1 0 1 1 6.56 6.82 47 19.0
## 5 1/3/2011 6:… 1 0 1 1 5.74 5.30 50 26.0
## 6 1/3/2011 7:… 1 0 1 1 5.74 6.82 50 13.0
## 7 1/3/2011 8:… 1 0 1 1 5.74 6.06 50 19.0
## 8 1/3/2011 9:… 1 0 1 1 6.56 6.82 43 26.0
## 9 1/3/2011 10… 1 0 1 1 7.38 8.34 43 17.0
## 10 1/3/2011 11… 1 0 1 1 8.2 9.09 40 22.0
## # ℹ 11,855 more rows
## # ℹ 4 more variables: casual <dbl>, registered <dbl>, count <dbl>,
## # sources <chr>