Import data from CSV file

library(readr) 

# Read the CSV into a tibble; readr guesses each column's type from the data.
bike_csv <- read_csv("bike_sharing_data.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 17379 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): datetime, sources
## dbl (11): season, holiday, workingday, weather, temp, atemp, humidity, winds...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# List the cells readr could not parse as the guessed column type.
problems(bike_csv)
## # A tibble: 1 × 5
##     row   col expected actual file 
##   <int> <int> <chr>    <chr>  <chr>
## 1 14178     8 a double x61    ""

Import data from TXT file

# Read the tab-delimited TXT file with base R; the result is a plain data.frame.
bike_txt <- read.delim("bike_sharing_data.txt")

Data structure and summary

# Show each column's type and first few values for the readr import.
str(bike_csv)
## spc_tbl_ [17,379 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ datetime  : chr [1:17379] "1/1/2011 0:00" "1/1/2011 1:00" "1/1/2011 2:00" "1/1/2011 3:00" ...
##  $ season    : num [1:17379] 1 1 1 1 1 1 1 1 1 1 ...
##  $ holiday   : num [1:17379] 0 0 0 0 0 0 0 0 0 0 ...
##  $ workingday: num [1:17379] 0 0 0 0 0 0 0 0 0 0 ...
##  $ weather   : num [1:17379] 1 1 1 1 1 2 1 1 1 1 ...
##  $ temp      : num [1:17379] 9.84 9.02 9.02 9.84 9.84 ...
##  $ atemp     : num [1:17379] 14.4 13.6 13.6 14.4 14.4 ...
##  $ humidity  : num [1:17379] 81 80 80 75 75 75 80 86 75 76 ...
##  $ windspeed : num [1:17379] 0 0 0 0 0 ...
##  $ casual    : num [1:17379] 3 8 5 3 0 0 2 1 1 8 ...
##  $ registered: num [1:17379] 13 32 27 10 1 1 0 2 7 6 ...
##  $ count     : num [1:17379] 16 40 32 13 1 1 2 3 8 14 ...
##  $ sources   : chr [1:17379] "ad campaign" "www.yahoo.com" "www.google.fi" "AD campaign" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   datetime = col_character(),
##   ..   season = col_double(),
##   ..   holiday = col_double(),
##   ..   workingday = col_double(),
##   ..   weather = col_double(),
##   ..   temp = col_double(),
##   ..   atemp = col_double(),
##   ..   humidity = col_double(),
##   ..   windspeed = col_double(),
##   ..   casual = col_double(),
##   ..   registered = col_double(),
##   ..   count = col_double(),
##   ..   sources = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Per-column summary; humidity reports one NA, left by the failed parse.
summary(bike_csv)
##    datetime             season         holiday          workingday    
##  Length:17379       Min.   :1.000   Min.   :0.00000   Min.   :0.0000  
##  Class :character   1st Qu.:2.000   1st Qu.:0.00000   1st Qu.:0.0000  
##  Mode  :character   Median :3.000   Median :0.00000   Median :1.0000  
##                     Mean   :2.502   Mean   :0.02877   Mean   :0.6827  
##                     3rd Qu.:3.000   3rd Qu.:0.00000   3rd Qu.:1.0000  
##                     Max.   :4.000   Max.   :1.00000   Max.   :1.0000  
##                                                                       
##     weather           temp           atemp          humidity     
##  Min.   :1.000   Min.   : 0.82   Min.   : 0.00   Min.   :  0.00  
##  1st Qu.:1.000   1st Qu.:13.94   1st Qu.:16.66   1st Qu.: 48.00  
##  Median :1.000   Median :20.50   Median :24.24   Median : 63.00  
##  Mean   :1.425   Mean   :20.38   Mean   :23.79   Mean   : 62.72  
##  3rd Qu.:2.000   3rd Qu.:27.06   3rd Qu.:31.06   3rd Qu.: 78.00  
##  Max.   :4.000   Max.   :41.00   Max.   :50.00   Max.   :100.00  
##                                                  NA's   :1       
##    windspeed          casual         registered        count    
##  Min.   : 0.000   Min.   :  0.00   Min.   :  0.0   Min.   :  1  
##  1st Qu.: 7.002   1st Qu.:  4.00   1st Qu.: 36.0   1st Qu.: 42  
##  Median :12.998   Median : 16.00   Median :116.0   Median :141  
##  Mean   :12.737   Mean   : 34.48   Mean   :152.5   Mean   :187  
##  3rd Qu.:16.998   3rd Qu.: 46.00   3rd Qu.:217.0   3rd Qu.:277  
##  Max.   :56.997   Max.   :367.00   Max.   :886.0   Max.   :977  
##                                                                 
##    sources         
##  Length:17379      
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
# Same inspection for the base R import; note humidity is character here.
str(bike_txt)
## 'data.frame':    17379 obs. of  13 variables:
##  $ datetime  : chr  "1/1/2011 0:00" "1/1/2011 1:00" "1/1/2011 2:00" "1/1/2011 3:00" ...
##  $ season    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ holiday   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ workingday: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather   : int  1 1 1 1 1 2 1 1 1 1 ...
##  $ temp      : num  9.84 9.02 9.02 9.84 9.84 ...
##  $ atemp     : num  14.4 13.6 13.6 14.4 14.4 ...
##  $ humidity  : chr  "81" "80" "80" "75" ...
##  $ windspeed : num  0 0 0 0 0 ...
##  $ casual    : int  3 8 5 3 0 0 2 1 1 8 ...
##  $ registered: int  13 32 27 10 1 1 0 2 7 6 ...
##  $ count     : int  16 40 32 13 1 1 2 3 8 14 ...
##  $ sources   : chr  "ad campaign" "www.yahoo.com" "www.google.fi" "AD campaign" ...
# Because humidity is character, only its length/class/mode are summarised.
summary(bike_txt)
##    datetime             season         holiday          workingday    
##  Length:17379       Min.   :1.000   Min.   :0.00000   Min.   :0.0000  
##  Class :character   1st Qu.:2.000   1st Qu.:0.00000   1st Qu.:0.0000  
##  Mode  :character   Median :3.000   Median :0.00000   Median :1.0000  
##                     Mean   :2.502   Mean   :0.02877   Mean   :0.6827  
##                     3rd Qu.:3.000   3rd Qu.:0.00000   3rd Qu.:1.0000  
##                     Max.   :4.000   Max.   :1.00000   Max.   :1.0000  
##     weather           temp           atemp         humidity        
##  Min.   :1.000   Min.   : 0.82   Min.   : 0.00   Length:17379      
##  1st Qu.:1.000   1st Qu.:13.94   1st Qu.:16.66   Class :character  
##  Median :1.000   Median :20.50   Median :24.24   Mode  :character  
##  Mean   :1.425   Mean   :20.38   Mean   :23.79                     
##  3rd Qu.:2.000   3rd Qu.:27.06   3rd Qu.:31.06                     
##  Max.   :4.000   Max.   :41.00   Max.   :50.00                     
##    windspeed          casual         registered        count    
##  Min.   : 0.000   Min.   :  0.00   Min.   :  0.0   Min.   :  1  
##  1st Qu.: 7.002   1st Qu.:  4.00   1st Qu.: 36.0   1st Qu.: 42  
##  Median :12.998   Median : 16.00   Median :116.0   Median :141  
##  Mean   :12.737   Mean   : 34.48   Mean   :152.5   Mean   :187  
##  3rd Qu.:16.998   3rd Qu.: 46.00   3rd Qu.:217.0   3rd Qu.:277  
##  Max.   :56.997   Max.   :367.00   Max.   :886.0   Max.   :977  
##    sources         
##  Length:17379      
##  Class :character  
##  Mode  :character  
##                    
##                    
## 

Count the number of records based on a variable’s values

# Count records per season and list the most frequent season first.
# The argument must be spelled `decreasing`; a misspelled name is silently
# absorbed by `...` and ignored, leaving the counts in ascending order.
sort(table(bike_csv$season), decreasing = TRUE)
## 
##    3    2    1    4 
## 4496 4409 4242 4232

Subset the data

# Row 8, column 9 (windspeed); `[` on a tibble returns a 1 x 1 tibble.
bike_csv[8, 9]
## # A tibble: 1 × 1
##   windspeed
##       <dbl>
## 1         0

Subset based on a single criterion

# Keep only the rows whose season equals 1.
subset(bike_csv, season == 1)
## # A tibble: 4,242 × 13
##    datetime     season holiday workingday weather  temp atemp humidity windspeed
##    <chr>         <dbl>   <dbl>      <dbl>   <dbl> <dbl> <dbl>    <dbl>     <dbl>
##  1 1/1/2011 0:…      1       0          0       1  9.84  14.4       81      0   
##  2 1/1/2011 1:…      1       0          0       1  9.02  13.6       80      0   
##  3 1/1/2011 2:…      1       0          0       1  9.02  13.6       80      0   
##  4 1/1/2011 3:…      1       0          0       1  9.84  14.4       75      0   
##  5 1/1/2011 4:…      1       0          0       1  9.84  14.4       75      0   
##  6 1/1/2011 5:…      1       0          0       2  9.84  12.9       75      6.00
##  7 1/1/2011 6:…      1       0          0       1  9.02  13.6       80      0   
##  8 1/1/2011 7:…      1       0          0       1  8.2   12.9       86      0   
##  9 1/1/2011 8:…      1       0          0       1  9.84  14.4       75      0   
## 10 1/1/2011 9:…      1       0          0       1 13.1   17.4       76      0   
## # ℹ 4,232 more rows
## # ℹ 4 more variables: casual <dbl>, registered <dbl>, count <dbl>,
## #   sources <chr>

Based on multiple criteria

# Keep rows meeting both conditions; `&` combines them elementwise.
subset(bike_csv, temp > 0.5 & workingday == 1)
## # A tibble: 11,865 × 13
##    datetime     season holiday workingday weather  temp atemp humidity windspeed
##    <chr>         <dbl>   <dbl>      <dbl>   <dbl> <dbl> <dbl>    <dbl>     <dbl>
##  1 1/3/2011 0:…      1       0          1       1  9.02  9.85       44      24.0
##  2 1/3/2011 1:…      1       0          1       1  8.2   8.34       44      28.0
##  3 1/3/2011 4:…      1       0          1       1  6.56  6.82       47      26.0
##  4 1/3/2011 5:…      1       0          1       1  6.56  6.82       47      19.0
##  5 1/3/2011 6:…      1       0          1       1  5.74  5.30       50      26.0
##  6 1/3/2011 7:…      1       0          1       1  5.74  6.82       50      13.0
##  7 1/3/2011 8:…      1       0          1       1  5.74  6.06       50      19.0
##  8 1/3/2011 9:…      1       0          1       1  6.56  6.82       43      26.0
##  9 1/3/2011 10…      1       0          1       1  7.38  8.34       43      17.0
## 10 1/3/2011 11…      1       0          1       1  8.2   9.09       40      22.0
## # ℹ 11,855 more rows
## # ℹ 4 more variables: casual <dbl>, registered <dbl>, count <dbl>,
## #   sources <chr>