# Import the comma-separated bike-sharing file with base R's read.csv().
bike_csv <- read.csv("bike_sharing_data.csv")
# Show the first six rows (head()'s default) as a formatted table.
kable(head(bike_csv))
datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | casual | registered | count | sources |
---|---|---|---|---|---|---|---|---|---|---|---|---|
1/1/2011 0:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 81 | 0.0000 | 3 | 13 | 16 | ad campaign |
1/1/2011 1:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0000 | 8 | 32 | 40 | www.yahoo.com |
1/1/2011 2:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0000 | 5 | 27 | 32 | www.google.fi |
1/1/2011 3:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0000 | 3 | 10 | 13 | AD campaign |
1/1/2011 4:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0000 | 0 | 1 | 1 | |
1/1/2011 5:00 | 1 | 0 | 0 | 2 | 9.84 | 12.880 | 75 | 6.0032 | 0 | 1 | 1 | www.bing.com |
# Inspect the column types of the import.
# Note: humidity comes in as character rather than a numeric type.
str(bike_csv)
## 'data.frame': 17379 obs. of 13 variables:
## $ datetime : chr "1/1/2011 0:00" "1/1/2011 1:00" "1/1/2011 2:00" "1/1/2011 3:00" ...
## $ season : int 1 1 1 1 1 1 1 1 1 1 ...
## $ holiday : int 0 0 0 0 0 0 0 0 0 0 ...
## $ workingday: int 0 0 0 0 0 0 0 0 0 0 ...
## $ weather : int 1 1 1 1 1 2 1 1 1 1 ...
## $ temp : num 9.84 9.02 9.02 9.84 9.84 ...
## $ atemp : num 14.4 13.6 13.6 14.4 14.4 ...
## $ humidity : chr "81" "80" "80" "75" ...
## $ windspeed : num 0 0 0 0 0 ...
## $ casual : int 3 8 5 3 0 0 2 1 1 8 ...
## $ registered: int 13 32 27 10 1 1 0 2 7 6 ...
## $ count : int 16 40 32 13 1 1 2 3 8 14 ...
## $ sources : chr "ad campaign" "www.yahoo.com" "www.google.fi" "AD campaign" ...
# Per-column summary statistics. Character columns (datetime, humidity,
# sources) only report length, class and mode.
summary(bike_csv)
## datetime season holiday workingday
## Length:17379 Min. :1.000 Min. :0.00000 Min. :0.0000
## Class :character 1st Qu.:2.000 1st Qu.:0.00000 1st Qu.:0.0000
## Mode :character Median :3.000 Median :0.00000 Median :1.0000
## Mean :2.502 Mean :0.02877 Mean :0.6827
## 3rd Qu.:3.000 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :4.000 Max. :1.00000 Max. :1.0000
## weather temp atemp humidity
## Min. :1.000 Min. : 0.82 Min. : 0.00 Length:17379
## 1st Qu.:1.000 1st Qu.:13.94 1st Qu.:16.66 Class :character
## Median :1.000 Median :20.50 Median :24.24 Mode :character
## Mean :1.425 Mean :20.38 Mean :23.79
## 3rd Qu.:2.000 3rd Qu.:27.06 3rd Qu.:31.06
## Max. :4.000 Max. :41.00 Max. :50.00
## windspeed casual registered count
## Min. : 0.000 Min. : 0.00 Min. : 0.0 Min. : 1
## 1st Qu.: 7.002 1st Qu.: 4.00 1st Qu.: 36.0 1st Qu.: 42
## Median :12.998 Median : 16.00 Median :116.0 Median :141
## Mean :12.737 Mean : 34.48 Mean :152.5 Mean :187
## 3rd Qu.:16.998 3rd Qu.: 46.00 3rd Qu.:217.0 3rd Qu.:277
## Max. :56.997 Max. :367.00 Max. :886.0 Max. :977
## sources
## Length:17379
## Class :character
## Mode :character
##
##
##
# Import the tab-delimited version of the data with read_tsv().
# The call reports parsing issues (see the warning that follows); inspect them
# with problems(bike_txt) before relying on the parsed column types.
bike_txt <- read_tsv("bike_sharing_data.txt")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 17379 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): datetime, sources
## dbl (11): season, holiday, workingday, weather, temp, atemp, humidity, winds...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows of the tab-delimited import as a table.
kable(head(bike_txt))
datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | casual | registered | count | sources |
---|---|---|---|---|---|---|---|---|---|---|---|---|
1/1/2011 0:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 81 | 0.0000 | 3 | 13 | 16 | ad campaign |
1/1/2011 1:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0000 | 8 | 32 | 40 | www.yahoo.com |
1/1/2011 2:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0000 | 5 | 27 | 32 | www.google.fi |
1/1/2011 3:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0000 | 3 | 10 | 13 | AD campaign |
1/1/2011 4:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0000 | 0 | 1 | 1 | |
1/1/2011 5:00 | 1 | 0 | 0 | 2 | 9.84 | 12.880 | 75 | 6.0032 | 0 | 1 | 1 | www.bing.com |
# Import the same CSV file again, this time with fread().
bike_dt <- fread("bike_sharing_data.csv")
# Preview the first six rows of the fread() import as a table.
kable(head(bike_dt))
datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | casual | registered | count | sources |
---|---|---|---|---|---|---|---|---|---|---|---|---|
1/1/2011 0:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 81 | 0.0000 | 3 | 13 | 16 | ad campaign |
1/1/2011 1:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0000 | 8 | 32 | 40 | www.yahoo.com |
1/1/2011 2:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0000 | 5 | 27 | 32 | www.google.fi |
1/1/2011 3:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0000 | 3 | 10 | 13 | AD campaign |
1/1/2011 4:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0000 | 0 | 1 | 1 | |
1/1/2011 5:00 | 1 | 0 | 0 | 2 | 9.84 | 12.880 | 75 | 6.0032 | 0 | 1 | 1 | www.bing.com |
Business Intelligence combines tools, databases, and methodologies to analyze historical and current data for decision-making. Answer: A. True
Atomic vector → contains a sequence of values of the same data type (e.g., c(1,2,3)); Matrix → a 2D structure whose elements all share the same data type; List → can contain objects of different types (including vectors or other lists); Data frame → a list of equal-length vectors with row/column structure.
A function call consists of function name + arguments inside parentheses. Answer: A. True
# Four base-R ways to import the data: the generic read.table() with an
# explicit separator and header flag, and the read.csv()/read.delim() wrappers.
bike1 <- read.table(
  file = "bike_sharing_data.csv",
  header = TRUE,
  sep = ","
)
bike2 <- read.table(
  file = "bike_sharing_data.txt",
  header = TRUE,
  sep = "\t"
)
bike3 <- read.csv(file = "bike_sharing_data.csv")
bike4 <- read.delim(file = "bike_sharing_data.txt")
# Look at the first six rows of the first import
head(bike1, n = 6L)
## datetime season holiday workingday weather temp atemp humidity
## 1 1/1/2011 0:00 1 0 0 1 9.84 14.395 81
## 2 1/1/2011 1:00 1 0 0 1 9.02 13.635 80
## 3 1/1/2011 2:00 1 0 0 1 9.02 13.635 80
## 4 1/1/2011 3:00 1 0 0 1 9.84 14.395 75
## 5 1/1/2011 4:00 1 0 0 1 9.84 14.395 75
## 6 1/1/2011 5:00 1 0 0 2 9.84 12.880 75
## windspeed casual registered count sources
## 1 0.0000 3 13 16 ad campaign
## 2 0.0000 8 32 40 www.yahoo.com
## 3 0.0000 5 27 32 www.google.fi
## 4 0.0000 3 10 13 AD campaign
## 5 0.0000 0 1 1 Twitter
## 6 6.0032 0 1 1 www.bing.com
# Total rows and columns in the bike sharing dataset
dim(bike_csv)
## [1] 17379 13
# (bike1 or bike3 from your Q4 chunk)
# NOTE(review): humidity was expected to be integer, but in this import it is
# character (see the captured output). Presumably the file contains at least
# one non-numeric humidity entry - confirm and clean it before numeric analysis.
str(bike1$humidity) # displays 'chr' here, not the expected 'int'
## chr [1:17379] "81" "80" "80" "75" "75" "75" "80" "86" "75" "76" "76" "81" ...
typeof(bike1$humidity) # displays 'character' here, not the expected 'integer'
## [1] "character"
# Look up the season code recorded in row 6251
bike_csv$season[6251]
## [1] 4
# Frequency of each season code (1 through 4)
table(bike_csv[["season"]])
##
## 1 2 3 4
## 4242 4409 4496 4232
# Count the season-4 rows by summing a logical mask (expected: 4232)
sum(bike_csv[["season"]] == 4)
## [1] 4232
# Equivalent count from the quiz feedback: filter first, then count the rows
nrow(subset(bike_csv, season == 4)) # 4232
## [1] 4232
# Example: all winter rows with windspeed > 0.3
# (head() shows only the first six matching rows.)
# NOTE(review): the filter uses season == 1, and windspeed in this file ranges
# from 0 to about 57 (see the summary output), so the 0.3 cutoff looks like it
# was meant for a normalized scale - confirm the season code and threshold.
head(subset(bike_csv, season == 1 & windspeed > 0.3))
## datetime season holiday workingday weather temp atemp humidity
## 6 1/1/2011 5:00 1 0 0 2 9.84 12.880 75
## 11 1/1/2011 10:00 1 0 0 1 15.58 19.695 76
## 12 1/1/2011 11:00 1 0 0 1 14.76 16.665 81
## 13 1/1/2011 12:00 1 0 0 1 17.22 21.210 77
## 14 1/1/2011 13:00 1 0 0 2 18.86 22.725 72
## 15 1/1/2011 14:00 1 0 0 2 18.86 22.725 72
## windspeed casual registered count sources
## 6 6.0032 0 1 1 www.bing.com
## 11 16.9979 12 24 36 www.bing.com
## 12 19.0012 26 30 56 www.yahoo.com
## 13 19.0012 29 55 84 www.google.fi
## 14 19.9995 47 47 94 AD campaign
## 15 19.0012 35 71 106 www.google.co.uk
# Count rows with windspeed >= 40 that fall in season 1 or season 4.
nrow(subset(bike_csv, windspeed >= 40 & season %in% c(1, 4))) # 46
## [1] 46
# Optional breakdown by season (41 rows in season 1, 5 rows in season 4):
table(subset(bike_csv, windspeed >= 40 & season %in% c(1, 4))$season)
##
## 1 4
## 41 5