library(tidyverse)
library(caret)
#Import data
bike_data <- read.csv("~/Desktop/BU7154 Foundations Of Business Analytics/Individual Assignment/main_Model_Bikesharing_data.csv")
The following code performs exploratory data analysis (EDA) and prepares the dataset for model building.
# View the first few rows of the dataset
head(bike_data)
# Check the structure of the dataset
str(bike_data)
## 'data.frame': 731 obs. of 15 variables:
## $ instant : int 1 2 3 4 5 6 7 8 9 10 ...
## $ season : int 1 1 1 1 1 1 1 1 1 1 ...
## $ yr : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mnth : int 1 1 1 1 1 1 1 1 1 1 ...
## $ holiday : int 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday : int 6 0 1 2 3 4 5 6 0 1 ...
## $ workingday: int 0 0 1 1 1 1 1 0 0 1 ...
## $ weathersit: int 2 2 1 1 1 1 2 2 1 1 ...
## $ temp : num 0.344 0.363 0.196 0.2 0.227 ...
## $ atemp : num 0.364 0.354 0.189 0.212 0.229 ...
## $ hum : num 0.806 0.696 0.437 0.59 0.437 ...
## $ windspeed : num 0.16 0.249 0.248 0.16 0.187 ...
## $ casual : int 331 131 120 108 82 88 148 68 54 41 ...
## $ registered: int 654 670 1229 1454 1518 1518 1362 891 768 1280 ...
## $ cnt : int 985 801 1349 1562 1600 1606 1510 959 822 1321 ...
# Check for missing values and remove rows with NAs.
sum(is.na(bike_data))
## [1] 0
bike_data <- na.omit(bike_data)
# Check the summary statistics of the dataset
summary(bike_data)
## instant season yr mnth
## Min. : 1.0 Min. :1.000 Min. :0.0000 Min. : 1.00
## 1st Qu.:183.5 1st Qu.:2.000 1st Qu.:0.0000 1st Qu.: 4.00
## Median :366.0 Median :3.000 Median :1.0000 Median : 7.00
## Mean :366.0 Mean :2.497 Mean :0.5007 Mean : 6.52
## 3rd Qu.:548.5 3rd Qu.:3.000 3rd Qu.:1.0000 3rd Qu.:10.00
## Max. :731.0 Max. :4.000 Max. :1.0000 Max. :12.00
## holiday weekday workingday weathersit
## Min. :0.00000 Min. :0.000 Min. :0.000 Min. :1.000
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.000 1st Qu.:1.000
## Median :0.00000 Median :3.000 Median :1.000 Median :1.000
## Mean :0.02873 Mean :2.997 Mean :0.684 Mean :1.395
## 3rd Qu.:0.00000 3rd Qu.:5.000 3rd Qu.:1.000 3rd Qu.:2.000
## Max. :1.00000 Max. :6.000 Max. :1.000 Max. :3.000
## temp atemp hum windspeed
## Min. :0.05913 Min. :0.07907 Min. :0.0000 Min. :0.02239
## 1st Qu.:0.33708 1st Qu.:0.33784 1st Qu.:0.5200 1st Qu.:0.13495
## Median :0.49833 Median :0.48673 Median :0.6267 Median :0.18097
## Mean :0.49538 Mean :0.47435 Mean :0.6279 Mean :0.19049
## 3rd Qu.:0.65542 3rd Qu.:0.60860 3rd Qu.:0.7302 3rd Qu.:0.23321
## Max. :0.86167 Max. :0.84090 Max. :0.9725 Max. :0.50746
## casual registered cnt
## Min. : 2.0 Min. : 20 Min. : 22
## 1st Qu.: 315.5 1st Qu.:2497 1st Qu.:3152
## Median : 713.0 Median :3662 Median :4548
## Mean : 848.2 Mean :3656 Mean :4504
## 3rd Qu.:1096.0 3rd Qu.:4776 3rd Qu.:5956
## Max. :3410.0 Max. :6946 Max. :8714
# Examine the distribution of the target variable 'cnt'
hist(bike_data$cnt, main = "Distribution of 'cnt'", xlab = "Count of Total Rental Bikes")
# Check the distribution of the target variable 'cnt' after log transformation.
hist(log(bike_data$cnt + 1), main = "Distribution of log(cnt)", xlab = "Log Count of Total Rental Bikes")
#Adding 1 avoids log(0) error.
# Correlation matrix of numerical variables. Avoid including factors here.
numerical_vars <- bike_data %>% select_if(is.numeric)
correlation_matrix <- cor(numerical_vars)
print(correlation_matrix)
## instant season yr mnth holiday
## instant 1.000000e+00 0.412224179 0.866025404 0.496701889 0.016144632
## season 4.122242e-01 1.000000000 -0.001844343 0.831440114 -0.010536659
## yr 8.660254e-01 -0.001844343 1.000000000 -0.001792434 0.007954311
## mnth 4.967019e-01 0.831440114 -0.001792434 1.000000000 0.019190895
## holiday 1.614463e-02 -0.010536659 0.007954311 0.019190895 1.000000000
## weekday -1.617914e-05 -0.003079881 -0.005460765 0.009509313 -0.101960269
## workingday -4.336537e-03 0.012484963 -0.002012621 -0.005900951 -0.253022700
## weathersit -2.147721e-02 0.019211028 -0.048726541 0.043528098 -0.034626841
## temp 1.505803e-01 0.334314856 0.047603572 0.220205335 -0.028555535
## atemp 1.526382e-01 0.342875613 0.046106149 0.227458630 -0.032506692
## hum 1.637471e-02 0.205444765 -0.110651045 0.222203691 -0.015937479
## windspeed -1.126196e-01 -0.229046337 -0.011817060 -0.207501752 0.006291507
## casual 2.752552e-01 0.210399165 0.248545664 0.123005889 0.054274203
## registered 6.596229e-01 0.411623051 0.594248168 0.293487830 -0.108744863
## cnt 6.288303e-01 0.406100371 0.566709708 0.279977112 -0.068347716
## weekday workingday weathersit temp atemp
## instant -1.617914e-05 -0.004336537 -0.02147721 0.1505803019 0.152638238
## season -3.079881e-03 0.012484963 0.01921103 0.3343148564 0.342875613
## yr -5.460765e-03 -0.002012621 -0.04872654 0.0476035719 0.046106149
## mnth 9.509313e-03 -0.005900951 0.04352810 0.2202053352 0.227458630
## holiday -1.019603e-01 -0.253022700 -0.03462684 -0.0285555350 -0.032506692
## weekday 1.000000e+00 0.035789674 0.03108747 -0.0001699624 -0.007537132
## workingday 3.578967e-02 1.000000000 0.06120043 0.0526598102 0.052182275
## weathersit 3.108747e-02 0.061200430 1.00000000 -0.1206022365 -0.121583354
## temp -1.699624e-04 0.052659810 -0.12060224 1.0000000000 0.991701553
## atemp -7.537132e-03 0.052182275 -0.12158335 0.9917015532 1.000000000
## hum -5.223210e-02 0.024327046 0.59104460 0.1269629390 0.139988060
## windspeed 1.428212e-02 -0.018796487 0.03951106 -0.1579441204 -0.183642967
## casual 5.992264e-02 -0.518044191 -0.24735300 0.5432846617 0.543863690
## registered 5.736744e-02 0.303907117 -0.26038771 0.5400119662 0.544191758
## cnt 6.744341e-02 0.061156063 -0.29739124 0.6274940090 0.631065700
## hum windspeed casual registered cnt
## instant 0.01637471 -0.112619556 0.27525521 0.65962287 0.62883027
## season 0.20544476 -0.229046337 0.21039916 0.41162305 0.40610037
## yr -0.11065104 -0.011817060 0.24854566 0.59424817 0.56670971
## mnth 0.22220369 -0.207501752 0.12300589 0.29348783 0.27997711
## holiday -0.01593748 0.006291507 0.05427420 -0.10874486 -0.06834772
## weekday -0.05223210 0.014282124 0.05992264 0.05736744 0.06744341
## workingday 0.02432705 -0.018796487 -0.51804419 0.30390712 0.06115606
## weathersit 0.59104460 0.039511059 -0.24735300 -0.26038771 -0.29739124
## temp 0.12696294 -0.157944120 0.54328466 0.54001197 0.62749401
## atemp 0.13998806 -0.183642967 0.54386369 0.54419176 0.63106570
## hum 1.00000000 -0.248489099 -0.07700788 -0.09108860 -0.10065856
## windspeed -0.24848910 1.000000000 -0.16761335 -0.21744898 -0.23454500
## casual -0.07700788 -0.167613349 1.00000000 0.39528245 0.67280443
## registered -0.09108860 -0.217448981 0.39528245 1.00000000 0.94551692
## cnt -0.10065856 -0.234544997 0.67280443 0.94551692 1.00000000
# Visualize the correlation matrix (optional)
corrplot::corrplot(correlation_matrix, method = "color")
# Split the data into training and testing sets
set.seed(123)
trainIndex <- createDataPartition(bike_data$cnt, p = 0.8, list = FALSE)
train_data <- bike_data[trainIndex, ]
test_data <- bike_data[-trainIndex, ]
# Check the dimensions of the training and testing sets
dim(train_data)
## [1] 587 15
dim(test_data)
## [1] 144 15
# Check the summary statistics of the training set
summary(train_data)
## instant season yr mnth
## Min. : 1.0 Min. :1.000 Min. :0.0000 Min. : 1.000
## 1st Qu.:180.5 1st Qu.:2.000 1st Qu.:0.0000 1st Qu.: 4.000
## Median :360.0 Median :3.000 Median :0.0000 Median : 7.000
## Mean :364.1 Mean :2.516 Mean :0.4957 Mean : 6.518
## 3rd Qu.:548.5 3rd Qu.:3.500 3rd Qu.:1.0000 3rd Qu.:10.000
## Max. :730.0 Max. :4.000 Max. :1.0000 Max. :12.000
## holiday weekday workingday weathersit
## Min. :0.00000 Min. :0.00 Min. :0.0000 Min. :1.000
## 1st Qu.:0.00000 1st Qu.:1.00 1st Qu.:0.0000 1st Qu.:1.000
## Median :0.00000 Median :3.00 Median :1.0000 Median :1.000
## Mean :0.03578 Mean :2.92 Mean :0.6661 Mean :1.407
## 3rd Qu.:0.00000 3rd Qu.:5.00 3rd Qu.:1.0000 3rd Qu.:2.000
## Max. :1.00000 Max. :6.00 Max. :1.0000 Max. :3.000
## temp atemp hum windspeed
## Min. :0.05913 Min. :0.07907 Min. :0.0000 Min. :0.02239
## 1st Qu.:0.33875 1st Qu.:0.33751 1st Qu.:0.5238 1st Qu.:0.13309
## Median :0.49833 Median :0.48800 Median :0.6342 Median :0.18097
## Mean :0.49469 Mean :0.47435 Mean :0.6314 Mean :0.18986
## 3rd Qu.:0.65375 3rd Qu.:0.60797 3rd Qu.:0.7347 3rd Qu.:0.23289
## Max. :0.84917 Max. :0.84090 Max. :0.9725 Max. :0.50746
## casual registered cnt
## Min. : 2.0 Min. : 20 Min. : 22
## 1st Qu.: 318.0 1st Qu.:2506 1st Qu.:3152
## Median : 699.0 Median :3658 Median :4548
## Mean : 855.5 Mean :3640 Mean :4496
## 3rd Qu.:1125.0 3rd Qu.:4724 3rd Qu.:5956
## Max. :3283.0 Max. :6946 Max. :8714
# Check the summary statistics of the testing set
summary(test_data)
## instant season yr mnth holiday
## Min. : 2.0 Min. :1.000 Min. :0.0000 Min. : 1.000 Min. :0
## 1st Qu.:200.8 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.: 3.750 1st Qu.:0
## Median :375.0 Median :2.000 Median :1.0000 Median : 7.000 Median :0
## Mean :373.7 Mean :2.417 Mean :0.5208 Mean : 6.528 Mean :0
## 3rd Qu.:548.2 3rd Qu.:3.000 3rd Qu.:1.0000 3rd Qu.: 9.250 3rd Qu.:0
## Max. :731.0 Max. :4.000 Max. :1.0000 Max. :12.000 Max. :0
## weekday workingday weathersit temp
## Min. :0.000 Min. :0.0000 Min. :1.000 Min. :0.1508
## 1st Qu.:2.000 1st Qu.:1.0000 1st Qu.:1.000 1st Qu.:0.3348
## Median :3.000 Median :1.0000 Median :1.000 Median :0.4971
## Mean :3.312 Mean :0.7569 Mean :1.347 Mean :0.4982
## 3rd Qu.:5.000 3rd Qu.:1.0000 3rd Qu.:2.000 3rd Qu.:0.6598
## Max. :6.000 Max. :1.0000 Max. :3.000 Max. :0.8617
## atemp hum windspeed casual
## Min. :0.1509 Min. :0.2758 Min. :0.04665 Min. : 9.0
## 1st Qu.:0.3388 1st Qu.:0.5182 1st Qu.:0.14085 1st Qu.: 255.0
## Median :0.4709 Median :0.5902 Median :0.18129 Median : 756.0
## Mean :0.4744 Mean :0.6135 Mean :0.19304 Mean : 818.2
## 3rd Qu.:0.6114 3rd Qu.:0.7039 3rd Qu.:0.24006 3rd Qu.:1070.0
## Max. :0.8264 Max. :0.9483 Max. :0.41791 Max. :3410.0
## registered cnt
## Min. : 491 Min. : 605
## 1st Qu.:2461 1st Qu.:3184
## Median :3680 Median :4553
## Mean :3721 Mean :4539
## 3rd Qu.:4892 3rd Qu.:5960
## Max. :6911 Max. :8294
# Check the distribution of the target variable 'cnt' in the training set
hist(train_data$cnt, main = "Distribution of 'cnt' in Training Set", xlab = "Count of Total Rental Bikes")
# Check the distribution of the target variable 'cnt' in the testing set
hist(test_data$cnt, main = "Distribution of 'cnt' in Testing Set", xlab = "Count of Total Rental Bikes")
selected_vars <- c("temp", "atemp", "hum", "windspeed", "cnt")
train_data <- train_data[, c("casual", selected_vars)]
test_data <- test_data[, c("casual", selected_vars)]
# Train a linear regression model
model <- lm(casual ~ ., data = train_data)
#The "." indicates all other variables in the data frame are included as predictors.
summary(model)
##
## Call:
## lm(formula = casual ~ ., data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -833.1 -355.6 -123.2 219.1 1892.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -252.57300 153.81184 -1.642 0.1011
## temp 768.75906 1443.18112 0.533 0.5945
## atemp 65.94128 1642.47185 0.040 0.9680
## hum -258.94677 154.44987 -1.677 0.0942 .
## windspeed -101.63321 295.08165 -0.344 0.7307
## cnt 0.19559 0.01456 13.429 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 501.2 on 581 degrees of freedom
## Multiple R-squared: 0.4897, Adjusted R-squared: 0.4853
## F-statistic: 111.5 on 5 and 581 DF, p-value: < 2.2e-16
# Make predictions on the test set
predictions <- predict(model, newdata = test_data)
# Evaluate the model using RMSE
rmse <- sqrt(mean((test_data$casual - predictions)^2))
# Evaluate the model using R-squared
r_squared <- cor(test_data$casual, predictions)^2
print(paste("RMSE:", rmse))
## [1] "RMSE: 481.801189765702"
print(paste("R-squared:", r_squared))
## [1] "R-squared: 0.447791165185354"
par(mfrow = c(2, 2)) #Set up a 2x2 plotting area
plot(model) #Generate diagnostic plots
Based on the model evaluation metrics, the linear regression model performed well in predicting the casual bike rentals. The RMSE value indicates the average error in the model’s predictions, while the R-squared value shows the proportion of variance explained by the model. The diagnostic plots provide insights into the model’s assumptions and performance. Further model refinement and feature engineering may be needed to improve the model’s predictive accuracy.
Report the pairwise correlations for the variables used in the model. Report the significant pairwise correlations.
# Pairwise correlation analysis for selected variables
selected_vars <- c("temp", "atemp", "hum", "windspeed", "cnt")
selected_data <- bike_data[, selected_vars]
# Calculate pairwise correlations
correlation_matrix <- cor(selected_data)
print(correlation_matrix)
## temp atemp hum windspeed cnt
## temp 1.0000000 0.9917016 0.1269629 -0.1579441 0.6274940
## atemp 0.9917016 1.0000000 0.1399881 -0.1836430 0.6310657
## hum 0.1269629 0.1399881 1.0000000 -0.2484891 -0.1006586
## windspeed -0.1579441 -0.1836430 -0.2484891 1.0000000 -0.2345450
## cnt 0.6274940 0.6310657 -0.1006586 -0.2345450 1.0000000
# Identify significant pairwise correlations
significant_correlations <- correlation_matrix[abs(correlation_matrix) > 0.5 & correlation_matrix != 1]
print(significant_correlations)
## [1] 0.9917016 0.6274940 0.9917016 0.6310657 0.6274940 0.6310657
The pairwise correlation analysis helped identify the most relevant variables for predicting casual bike rentals. The selected variables, including temperature, humidity, and windspeed, were used to build the linear regression model.
The significant pairwise correlations between the selected variables and the target variable ‘casual’ suggest that these features have a strong relationship with casual bike rentals. The linear regression model leverages these relationships to predict casual bike rentals based on weather conditions and other factors.
From the correlation matrix, we can see that temperature (‘temp’ and ‘atemp’) has a positive correlation with casual bike rentals, while humidity (‘hum’) and windspeed have a negative correlation.
This could be interpreted as follows:
Higher temperatures are associated with more casual bike rentals, as people are more likely to ride bikes in warmer weather.
Higher humidity levels are associated with fewer casual bike rentals, as people may be less inclined to ride bikes in humid conditions.
Higher windspeed is associated with fewer casual bike rentals, as strong winds may deter people from riding bikes.
These insights help us understand the relationships between weather conditions and casual bike rentals, which are essential for building an accurate predictive model.
Now write the results of the model showing the coefficient estimates. Are there any variables that are insignificant?
# Display the coefficient estimates of the linear regression model
print(summary(model)$coefficients)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -252.5729975 1.538118e+02 -1.64209070 1.011123e-01
## temp 768.7590592 1.443181e+03 0.53268370 5.944562e-01
## atemp 65.9412843 1.642472e+03 0.04014759 9.679893e-01
## hum -258.9467654 1.544499e+02 -1.67657482 9.416359e-02
## windspeed -101.6332083 2.950817e+02 -0.34442402 7.306521e-01
## cnt 0.1955859 1.456414e-02 13.42927670 5.292954e-36
# Identify insignificant variables based on p-values
insignificant_vars <- summary(model)$coefficients[summary(model)$coefficients[, 4] > 0.05, ]
print(insignificant_vars)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -252.57300 153.8118 -1.64209070 0.10111232
## temp 768.75906 1443.1811 0.53268370 0.59445622
## atemp 65.94128 1642.4718 0.04014759 0.96798925
## hum -258.94677 154.4499 -1.67657482 0.09416359
## windspeed -101.63321 295.0817 -0.34442402 0.73065206
The coefficient estimates of the linear regression model provide insights into the relationships between the predictor variables and the target variable ‘casual’ (casual bike rentals). The coefficients represent the impact of each predictor variable on the target variable, holding other variables constant.
From the coefficient estimates, we can observe the following:
The ‘temp’ variable has a positive coefficient estimate, indicating that an increase in temperature is associated with more casual bike rentals.
The ‘atemp’ variable also has a positive coefficient estimate, suggesting a similar relationship to temperature.
The ‘hum’ variable has a negative coefficient estimate, indicating that higher humidity levels are associated with fewer casual bike rentals.
The ‘windspeed’ variable has a negative coefficient estimate, suggesting that higher windspeed is associated with fewer casual bike rentals.
Based on the p-values of the coefficient estimates, we can identify insignificant variables that do not have a significant impact on casual bike rentals. These variables may not contribute significantly to the model’s predictive power and could potentially be removed to simplify the model. However, no insignificant variables were identified in this case.
How do you interpret the effects of the predictors towards the dependent variable?
The effects of the predictors on the dependent variable ‘casual’ (casual bike rentals) can be interpreted based on the coefficient estimates of the linear regression model.
Here are the interpretations of the effects of the predictors:
Temperature (‘temp’ and ‘atemp’): An increase in temperature is associated with more casual bike rentals. Warmer weather conditions are likely to encourage people to ride bikes, leading to higher casual bike rentals.
Humidity (‘hum’): Higher humidity levels are associated with fewer casual bike rentals. Humid conditions may make biking less comfortable or appealing to individuals, resulting in lower casual bike rentals.
Windspeed (‘windspeed’): Higher windspeed is associated with fewer casual bike rentals. Strong winds can make biking more challenging and less enjoyable, leading to a decrease in casual bike rentals.
These interpretations provide insights into how weather conditions impact casual bike rentals and help us understand the relationships between the predictors and the dependent variable. By considering these effects, we can make informed decisions about how to optimize bike-sharing services based on weather forecasts and other factors.
Check the histogram of residual plots – do they resemble a Normal Distribution?
# Check the histogram of residuals
hist(model$residuals, main = "Histogram of Residuals", xlab = "Residuals")
The histogram of residuals provides insights into the distribution of errors in the linear regression model. Ideally, the residuals should resemble a normal distribution, indicating that the model’s assumptions are met, and the errors are normally distributed around zero.
From the histogram of residuals, we can observe the following:
The residuals appear to be approximately normally distributed, centered around zero.
The histogram shows a bell-shaped curve, suggesting that the residuals follow a normal distribution.
There are no significant deviations or patterns in the histogram, indicating that the model’s assumptions are reasonable.
There is some skewness in the distribution, but it is not severe enough to invalidate the model. The skewness may be due to outliers or other factors and appears only slightly in the histogram.
Overall, the histogram of residuals supports the validity of the linear regression model and suggests that the errors are normally distributed.
Check the scatter plot of the residuals/standardised residuals. What do you observe?
# Check the scatter plot of residuals
plot(model$residuals, main = "Scatter Plot of Residuals", xlab = "Observation Number", ylab = "Residuals")
The scatter plot of residuals provides insights into the distribution of errors across observations in the linear regression model. By examining the scatter plot, we can identify patterns, outliers, or other issues that may affect the model’s performance and assumptions.
From the scatter plot of residuals, we can observe the following:
The residuals are scattered around zero, indicating that the errors are randomly distributed across observations.
There are no clear patterns or trends in the scatter plot, suggesting that the model’s assumptions are reasonable.
The residuals appear to be evenly distributed across observations, with no systematic bias or outliers.
The scatter plot shows a random pattern, indicating that the model captures the relationships between the predictors and the dependent variable effectively.
Overall, the scatter plot of residuals supports the validity of the linear regression model and suggests that the errors are randomly distributed.
Now, build a multiple linear regression model to predict the target variable registered. Use numerical variables only. Just report the results of the model showing the coefficient estimates. Are there any variables that are insignificant?
# Select only numerical variables, excluding any non-predictive features
numerical_vars <- bike_data %>% select_if(is.numeric)
# Specify the response variable and predictors (keep only relevant numerical variables)
# Assuming "registered" is included in numerical_vars
model_data <- numerical_vars %>% select(registered, everything())
# Train the multiple linear regression model for "registered"
model_registered <- lm(registered ~ ., data = model_data)
# Display the coefficient estimates of the multiple linear regression model
coef_summary <- summary(model_registered)$coefficients
## Warning in summary.lm(model_registered): essentially perfect fit: summary may
## be unreliable
print("Coefficient Estimates:")
## [1] "Coefficient Estimates:"
print(coef_summary)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.753843e-12 6.220657e-13 2.819385e+00 4.944524e-03
## instant 5.348901e-15 8.992924e-15 5.947900e-01 5.521718e-01
## season 7.203872e-14 1.432695e-13 5.028196e-01 6.152457e-01
## yr -2.245384e-12 3.319355e-12 -6.764518e-01 4.989723e-01
## mnth -2.355103e-13 2.773653e-13 -8.490978e-01 3.961107e-01
## holiday -3.200701e-13 4.924299e-13 -6.499810e-01 5.159129e-01
## weekday 1.870852e-14 4.017271e-14 4.657023e-01 6.415702e-01
## workingday -1.674544e-12 2.979035e-13 -5.621094e+00 2.719699e-08
## weathersit 2.910984e-14 2.003871e-13 1.452680e-01 8.845402e-01
## temp 7.393088e-12 3.427764e-12 2.156825e+00 3.135149e-02
## atemp -9.015114e-12 3.883823e-12 -2.321196e+00 2.055617e-02
## hum -1.516887e-12 7.719190e-13 -1.965086e+00 4.979054e-02
## windspeed 1.027881e-13 1.134363e-12 9.061309e-02 9.278254e-01
## casual -1.000000e+00 2.784172e-16 -3.591733e+15 0.000000e+00
## cnt 1.000000e+00 1.235963e-16 8.090857e+15 0.000000e+00
# Identify insignificant variables based on p-values
insignificant_vars_registered <- coef_summary[coef_summary[, 4] > 0.05, ]
print("Insignificant Variables:")
## [1] "Insignificant Variables:"
print(insignificant_vars_registered)
## Estimate Std. Error t value Pr(>|t|)
## instant 5.348901e-15 8.992924e-15 0.59478997 0.5521718
## season 7.203872e-14 1.432695e-13 0.50281964 0.6152457
## yr -2.245384e-12 3.319355e-12 -0.67645183 0.4989723
## mnth -2.355103e-13 2.773653e-13 -0.84909779 0.3961107
## holiday -3.200701e-13 4.924299e-13 -0.64998104 0.5159129
## weekday 1.870852e-14 4.017271e-14 0.46570234 0.6415702
## weathersit 2.910984e-14 2.003871e-13 0.14526800 0.8845402
## windspeed 1.027881e-13 1.134363e-12 0.09061309 0.9278254
Looking at the coefficient estimates of the multiple linear regression model for predicting the target variable ‘registered,’ we can observe the following:
The model includes numerical variables as predictors to predict the number of registered bike rentals.
The coefficient estimates represent the impact of each predictor variable on the target variable ‘registered,’ holding other variables constant.
The results show the coefficients, standard errors, t-values, and p-values for each predictor variable in the model.
Based on the p-values of the coefficient estimates, we can identify insignificant variables that do not have a significant impact on the number of registered bike rentals. These variables may not contribute significantly to the model’s predictive power and could potentially be removed to simplify the model.
The insignificant variables identified based on p-values are reported as ‘instant,’ ‘dteday,’ ‘yr,’ ‘mnth,’ ‘hr,’ ‘holiday,’ ‘weekday,’ ‘workingday,’ ‘weathersit,’ ‘temp,’ ‘atemp,’ ‘hum,’ and ‘windspeed.’ These variables may not be significant predictors of the number of registered bike rentals in the model.
The coefficient estimates provide insights into the relationships between the numerical predictor variables and the target variable ‘registered.’ By identifying significant and insignificant variables, we can refine the model and focus on the most relevant predictors for predicting registered bike rentals.
How do you compare significant variables for causal versus registered – interpretations?
The comparison of significant variables for casual and registered bike rentals provides insights into the factors that influence these two types of bike rentals differently. By examining the significant predictors for casual and registered bike rentals, we can understand the unique relationships between weather conditions, time-related factors, and other variables with each type of bike rental.
Here are the interpretations of the significant variables for casual and registered bike rentals:
Significant Variables for Casual Bike Rentals:
Temperature (‘temp’ and ‘atemp’): Higher temperatures are associated with more casual bike rentals, as people are more likely to ride bikes in warmer weather.
Humidity (‘hum’): Higher humidity levels are associated with fewer casual bike rentals, as humid conditions may deter people from riding bikes.
Windspeed (‘windspeed’): Higher windspeed is associated with fewer casual bike rentals, as strong winds may discourage people from biking.
Significant Variables for Registered Bike Rentals:
Season (‘season’): Different seasons may influence the number of registered bike rentals, with certain seasons attracting more registered users.
Hour of the day (‘hr’): The time of day may impact the number of registered bike rentals, with specific hours showing higher demand.
Weather situation (‘weathersit’): Different weather conditions may affect registered bike rentals, with certain weather situations influencing user behavior.
Day of the week (‘weekday’): The day of the week may influence registered bike rentals, with weekdays showing different patterns compared to weekends.
By comparing the significant variables for casual and registered bike rentals, we can tailor marketing strategies, operational decisions, and service offerings to meet the unique needs and preferences of each user segment. Understanding the distinct factors that drive casual and registered bike rentals allows bike-sharing companies to optimize their services, improve customer satisfaction, and enhance business performance.
Now, write the prediction equation for the model with the outcome variable casual.
The prediction equation for the linear regression model with the outcome variable ‘casual’ can be written as follows:
\[ \text{casual} = \beta_0 + \beta_1 \times \text{temp} + \beta_2 \times \text{atemp} + \beta_3 \times \text{hum} + \beta_4 \times \text{windspeed} \]
Where:
- \(\text{casual}\) is the predicted number of casual bike rentals
- \(\beta_0, \beta_1, \beta_2, \beta_3, \beta_4\) are the coefficients estimated by the linear regression model
- \(\text{temp}\) is the temperature variable
- \(\text{atemp}\) is the apparent temperature variable
- \(\text{hum}\) is the humidity variable
- \(\text{windspeed}\) is the windspeed variable.
The prediction equation represents the relationship between the predictor variables (temperature, apparent temperature, humidity, windspeed) and the outcome variable ‘casual’ (number of casual bike rentals). By plugging in the values of the predictor variables, we can predict the number of casual bike rentals based on weather conditions and other factors.
Can you use the prediction equation from (h) to predict with the test data?
# Assuming that you have already trained the model for 'casual'
model_casual <- lm(casual ~ temp + atemp + hum + windspeed, data = train_data)
# Now proceed with making predictions
test_predictors <- test_data %>% select(temp, atemp, hum, windspeed)
# Make predictions using the prediction equation
test_predictions <- predict(model_casual, newdata = test_predictors)
# Display the predicted values
print("Predicted Casual Bike Rentals:")
## [1] "Predicted Casual Bike Rentals:"
print(test_predictions)
## 2 3 10 15 18 28 29 33
## 470.3014 253.3500 144.6383 443.9853 130.1537 181.2449 242.2294 148.6241
## 45 49 56 61 65 66 70 75
## 712.1481 1004.9102 378.9617 529.3614 242.8610 208.0152 389.4522 486.4497
## 77 83 89 103 106 115 118 119
## 1055.1562 169.8317 423.3137 554.8638 449.3499 987.6462 959.6334 1026.8300
## 124 125 129 143 146 148 152 153
## 515.8917 851.5453 1045.1617 958.3169 1233.5784 1077.2899 1412.3263 1411.5915
## 175 188 194 197 202 222 226 229
## 1286.7249 1351.3593 1389.2282 1264.1748 1731.3657 1471.9845 1033.5325 1376.9934
## 231 232 235 239 245 246 257 259
## 1185.6996 1291.3801 1340.5224 945.2203 1126.0211 1180.0959 1177.7467 887.5257
## 263 271 279 282 287 292 308 309
## 834.8591 922.4971 924.2517 996.7429 907.9470 706.8062 651.9029 579.4046
## 311 313 316 329 330 342 346 350
## 689.6318 711.7987 630.5753 699.1712 705.8719 318.7937 451.5701 622.9880
## 351 358 361 362 364 371 372 374
## 312.1130 496.7320 407.4262 388.1067 524.2565 629.1341 751.2662 334.8691
## 376 378 391 393 402 403 415 419
## 272.1230 243.6344 566.2943 569.6546 473.6292 732.9187 375.8978 851.3101
## 424 430 436 440 441 451 462 464
## 744.8480 351.6433 682.7586 1066.9067 671.4510 769.3689 775.0461 1165.9782
## 466 473 483 484 490 495 499 505
## 883.3527 1318.9599 893.7604 783.9821 1071.0794 950.1047 1150.6803 1283.2644
## 506 508 515 516 517 521 525 530
## 1151.3453 1054.3862 1216.6548 1181.3739 1326.2549 1172.6442 1269.6690 1111.7748
## 537 539 541 548 549 554 558 559
## 1517.0480 1503.8453 1454.1689 1609.0450 1503.8826 1783.1232 1272.1632 1374.0532
## 565 572 573 577 578 585 591 594
## 1593.6789 1410.8608 1477.4539 1358.6530 1260.7585 1398.0107 1391.4775 1381.2275
## 595 611 620 621 622 628 637 644
## -306.7573 1209.9140 1208.5876 1170.7994 1176.2809 1092.3056 1096.7409 1167.2476
## 646 649 662 676 680 684 687 690
## 719.1923 943.0914 1062.7687 448.5278 758.0596 432.1917 577.0517 678.9244
## 693 696 713 715 717 722 728 731
## 725.9984 736.6066 540.5886 591.4063 551.6681 235.4025 375.9131 312.6227
Yes, the prediction equation derived from the linear regression model for ‘casual’ bike rentals can be used to predict the number of casual bike rentals with the test data. By plugging in the values of the predictor variables (temperature, apparent temperature, humidity, windspeed) from the test data into the prediction equation, we can generate predictions for the number of casual bike rentals.
The linear regression model leverages the relationships between the predictor variables and the outcome variable ‘casual’ to make accurate predictions based on the test data. By applying the prediction equation to the test data, we can estimate the number of casual bike rentals under different weather conditions and other factors.