- Rpubs allows you to publish your RMarkdown documents on the web.
- Use this link to access the presentation: www………
October 2020
# Load the daily bike-rental records into a data frame
day <- read.csv(file = "day.csv")
# Show each column's type together with its first few values
str(day)
## 'data.frame': 731 obs. of 16 variables: ## $ instant : int 1 2 3 4 5 6 7 8 9 10 ... ## $ dteday : chr "2011-01-01" "2011-01-02" "2011-01-03" "2011-01-04" ... ## $ season : int 1 1 1 1 1 1 1 1 1 1 ... ## $ yr : int 0 0 0 0 0 0 0 0 0 0 ... ## $ mnth : int 1 1 1 1 1 1 1 1 1 1 ... ## $ holiday : int 0 0 0 0 0 0 0 0 0 0 ... ## $ weekday : int 6 0 1 2 3 4 5 6 0 1 ... ## $ workingday: int 0 0 1 1 1 1 1 0 0 1 ... ## $ weathersit: int 2 2 1 1 1 1 2 2 1 1 ... ## $ temp : num 0.344 0.363 0.196 0.2 0.227 ... ## $ atemp : num 0.364 0.354 0.189 0.212 0.229 ... ## $ hum : num 0.806 0.696 0.437 0.59 0.437 ... ## $ windspeed : num 0.16 0.249 0.248 0.16 0.187 ... ## $ casual : int 331 131 120 108 82 88 148 68 54 41 ... ## $ registered: int 654 670 1229 1454 1518 1518 1362 891 768 1280 ... ## $ cnt : int 985 801 1349 1562 1600 1606 1510 959 822 1321 ...
| Variable | Definition |
|---|---|
| instant | record index |
| dteday | date |
| season | 1:winter, 2:spring, 3:summer, 4:fall |
| yr | 0:2011, 1:2012 |
| mnth | month (1 to 12) |
| holiday | whether the day is a holiday or not |
| weekday | day of the week (0 to 6) |
| workingday | 1 if the day is neither a weekend nor a holiday, otherwise 0 |
| weathersit | 1:clear_sky, 2:cloudy, 3:ligth_snow_rain |
| temp | normalized temperature in Celsius, the values are derived via t_min=-8 and t_max=+39 |
| atemp | normalized feeling temperature in Celsius, the values are derived via t_min=-16 and t_max=+50 |
| hum | normalized humidity, the values are divided by 100 (max) |
| windspeed | normalized wind speed, the values are divided by 67 (max) |
| casual | count of bike rentals by casual users |
| registered | count of bike rentals by registered users |
| cnt | count of total rental bikes, the sum of variables casual and registered |
# Check missing values
# The check must target the `day` data frame loaded from day.csv; `data` is
# not a dataset in this script, so testing it says nothing about the records.
any(is.na(day))
## [1] FALSE
# Parse the "YYYY-MM-DD" strings in dteday into Date objects
day[["dteday"]] <- as.Date(day[["dteday"]], format = "%Y-%m-%d")
# Turn the integer season codes (1-4) into a labelled factor
day[["season"]] <- factor(
  day[["season"]],
  levels = seq_len(4),
  labels = c("winter", "spring", "summer", "fall")
)
# Turn the integer weather-situation codes (1-4) into a labelled factor
day[["weathersit"]] <- factor(
  day[["weathersit"]],
  levels = seq_len(4),
  labels = c("clear_sky", "cloudy", "ligth_snow_rain", "heavy_snow_rain")
)
# Line chart of the total number of bikes rented per day over time
ggplot(data = day, mapping = aes(x = dteday, y = cnt)) +
  geom_line() +
  labs(x = "Date", y = "Number of Bikes Rented") +
  theme_minimal() +
  theme(
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 14)
  )
# Box plots of daily rentals, one box per weather situation; the x-axis
# title and tick labels are hidden, so the fill legend identifies each box
ggplot(
  data = day,
  mapping = aes(x = weathersit, y = cnt, fill = weathersit)
) +
  geom_boxplot() +
  labs(x = "", y = "Number of Bikes Rented", fill = "") +
  theme_minimal() +
  theme(
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 14),
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    legend.text = element_text(size = 14)
  )
# Summary statistics of daily rentals (cnt) for each season
table <- day %>%
  group_by(season) %>%
  summarise(
    min = min(cnt),
    lowerQuantile = quantile(cnt, probs = 0.25),
    median = median(cnt),
    upperQuantile = quantile(cnt, probs = 0.75),
    max = max(cnt),
    Mean = mean(cnt),
    standardDeviation = sd(cnt),
    count = n()
  )
# Render the per-season summary as a formatted table
knitr::kable(table)
| season | min | lowerQuantile | median | upperQuantile | max | Mean | standardDeviation | count |
|---|---|---|---|---|---|---|---|---|
| winter | 431 | 1538.0 | 2209.0 | 3456.00 | 7836 | 2604.133 | 1399.942 | 181 |
| spring | 795 | 4003.0 | 4941.5 | 6377.00 | 8362 | 4992.332 | 1695.977 | 184 |
| summer | 1115 | 4586.5 | 5353.5 | 6929.25 | 8714 | 5644.303 | 1459.800 | 188 |
| fall | 22 | 3615.5 | 4634.5 | 5624.50 | 8555 | 4728.163 | 1699.615 | 178 |
\[y = B_0 + \sum^n_{i = 1} B_i x_i + E\]
\(y =\) dependent variable ; \(B_i =\) parameter
\(x_i =\) independent variable ; \(E =\) error
# Remove unwanted variables: the record index, the date, and the two
# components of the response (casual and registered add up to cnt)
day <- select(day, -c(dteday, instant, casual, registered))
# Create multiple linear regression model of cnt on all remaining columns
model <- lm(cnt ~ ., data = day)
# Check model summary
model %>% summary()
## ## Call: ## lm(formula = cnt ~ ., data = day) ## ## Residuals: ## Min 1Q Median 3Q Max ## -3617.0 -370.3 72.4 473.0 3128.9 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 1307.97 241.26 5.421 8.09e-08 *** ## seasonspring 1158.76 114.67 10.105 < 2e-16 *** ## seasonsummer 921.46 165.16 5.579 3.43e-08 *** ## seasonfall 1651.21 153.67 10.745 < 2e-16 *** ## yr 2018.97 61.02 33.089 < 2e-16 *** ## mnth -15.24 16.15 -0.943 0.34574 ## holiday -531.53 187.34 -2.837 0.00468 ** ## weekday 67.51 15.17 4.449 1.00e-05 *** ## workingday 116.17 67.10 1.731 0.08385 . ## weathersitcloudy -452.17 80.56 -5.613 2.85e-08 *** ## weathersitligth_snow_rain -1954.82 205.44 -9.515 < 2e-16 *** ## temp 3941.82 1380.69 2.855 0.00443 ** ## atemp 1290.20 1507.55 0.856 0.39238 ## hum -1198.19 294.51 -4.068 5.26e-05 *** ## windspeed -2708.11 429.85 -6.300 5.19e-10 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 812.2 on 716 degrees of freedom ## Multiple R-squared: 0.8276, Adjusted R-squared: 0.8242 ## F-statistic: 245.5 on 14 and 716 DF, p-value: < 2.2e-16
# Create variables importance dataframe from the fitted model
varimport <- as.data.frame(varImp(model))
# Keep the term names as a regular column alongside the row names
varimport$variable <- rownames(varimport)
# Print variables importance dataframe, ordered from highest to lowest score
knitr::kable(varimport[order(varimport$Overall, decreasing = TRUE), ])
| | Overall | variable |
|---|---|---|
| yr | 33.0893935 | yr |
| seasonfall | 10.7454714 | seasonfall |
| seasonspring | 10.1053497 | seasonspring |
| weathersitligth_snow_rain | 9.5151611 | weathersitligth_snow_rain |
| windspeed | 6.3001274 | windspeed |
| weathersitcloudy | 5.6127197 | weathersitcloudy |
| seasonsummer | 5.5792940 | seasonsummer |
| weekday | 4.4490027 | weekday |
| hum | 4.0684423 | hum |
| temp | 2.8549587 | temp |
| holiday | 2.8372380 | holiday |
| workingday | 1.7311647 | workingday |
| mnth | 0.9434979 | mnth |
| atemp | 0.8558282 | atemp |