Install Packages

Load Libraries and Data

library(tidyverse)
library(caret)

# Import the bike-sharing dataset from CSV into a data frame.
# NOTE(review): this is an absolute, machine-specific path; it will only
# resolve on a computer with the same Desktop folder layout.
bike_data <- read.csv(
  file = "~/Desktop/BU7154 Foundations Of Business Analytics/Individual Assignment/main_Model_Bikesharing_data.csv"
)

Data Exploration and Preprocessing

The following code performs exploratory data analysis (EDA) and prepares the dataset for model building.

# Preview the leading rows of the dataset
bike_data %>% head()

# Show each column's type together with its first few values
bike_data %>% str()

## 'data.frame':    731 obs. of  15 variables:
##  $ instant   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ season    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ yr        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mnth      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ holiday   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weekday   : int  6 0 1 2 3 4 5 6 0 1 ...
##  $ workingday: int  0 0 1 1 1 1 1 0 0 1 ...
##  $ weathersit: int  2 2 1 1 1 1 2 2 1 1 ...
##  $ temp      : num  0.344 0.363 0.196 0.2 0.227 ...
##  $ atemp     : num  0.364 0.354 0.189 0.212 0.229 ...
##  $ hum       : num  0.806 0.696 0.437 0.59 0.437 ...
##  $ windspeed : num  0.16 0.249 0.248 0.16 0.187 ...
##  $ casual    : int  331 131 120 108 82 88 148 68 54 41 ...
##  $ registered: int  654 670 1229 1454 1518 1518 1362 891 768 1280 ...
##  $ cnt       : int  985 801 1349 1562 1600 1606 1510 959 822 1321 ...

# Count the missing cells across the whole data frame
bike_data %>% is.na() %>% sum()

## [1] 0

# Drop any rows that contain NAs (nothing is removed when the count above is 0)
bike_data <- na.omit(object = bike_data)

# Summary statistics for every column of the dataset
bike_data %>% summary()

##     instant          season            yr              mnth      
##  Min.   :  1.0   Min.   :1.000   Min.   :0.0000   Min.   : 1.00  
##  1st Qu.:183.5   1st Qu.:2.000   1st Qu.:0.0000   1st Qu.: 4.00  
##  Median :366.0   Median :3.000   Median :1.0000   Median : 7.00  
##  Mean   :366.0   Mean   :2.497   Mean   :0.5007   Mean   : 6.52  
##  3rd Qu.:548.5   3rd Qu.:3.000   3rd Qu.:1.0000   3rd Qu.:10.00  
##  Max.   :731.0   Max.   :4.000   Max.   :1.0000   Max.   :12.00  
##     holiday           weekday        workingday      weathersit   
##  Min.   :0.00000   Min.   :0.000   Min.   :0.000   Min.   :1.000  
##  1st Qu.:0.00000   1st Qu.:1.000   1st Qu.:0.000   1st Qu.:1.000  
##  Median :0.00000   Median :3.000   Median :1.000   Median :1.000  
##  Mean   :0.02873   Mean   :2.997   Mean   :0.684   Mean   :1.395  
##  3rd Qu.:0.00000   3rd Qu.:5.000   3rd Qu.:1.000   3rd Qu.:2.000  
##  Max.   :1.00000   Max.   :6.000   Max.   :1.000   Max.   :3.000  
##       temp             atemp              hum           windspeed      
##  Min.   :0.05913   Min.   :0.07907   Min.   :0.0000   Min.   :0.02239  
##  1st Qu.:0.33708   1st Qu.:0.33784   1st Qu.:0.5200   1st Qu.:0.13495  
##  Median :0.49833   Median :0.48673   Median :0.6267   Median :0.18097  
##  Mean   :0.49538   Mean   :0.47435   Mean   :0.6279   Mean   :0.19049  
##  3rd Qu.:0.65542   3rd Qu.:0.60860   3rd Qu.:0.7302   3rd Qu.:0.23321  
##  Max.   :0.86167   Max.   :0.84090   Max.   :0.9725   Max.   :0.50746  
##      casual         registered        cnt      
##  Min.   :   2.0   Min.   :  20   Min.   :  22  
##  1st Qu.: 315.5   1st Qu.:2497   1st Qu.:3152  
##  Median : 713.0   Median :3662   Median :4548  
##  Mean   : 848.2   Mean   :3656   Mean   :4504  
##  3rd Qu.:1096.0   3rd Qu.:4776   3rd Qu.:5956  
##  Max.   :3410.0   Max.   :6946   Max.   :8714

# Examine the distribution of the target variable 'cnt'
hist(bike_data$cnt, main = "Distribution of 'cnt'", xlab = "Count of Total Rental Bikes")

# Distribution of 'cnt' on the log scale.
# log1p(x) computes log(1 + x). The +1 offset keeps a zero count finite:
# log(0) in R does not raise an error, it returns -Inf, which hist() cannot bin.
hist(log1p(bike_data$cnt), main = "Distribution of log(cnt)", xlab = "Log Count of Total Rental Bikes")

# Correlation matrix of the numeric columns. Factor columns must be left out
# because cor() only accepts numeric input.
# select(where(is.numeric)) replaces the superseded select_if(is.numeric).
numerical_vars <- bike_data %>% select(where(is.numeric))
correlation_matrix <- cor(numerical_vars)
print(correlation_matrix)

##                  instant       season           yr         mnth      holiday
## instant     1.000000e+00  0.412224179  0.866025404  0.496701889  0.016144632
## season      4.122242e-01  1.000000000 -0.001844343  0.831440114 -0.010536659
## yr          8.660254e-01 -0.001844343  1.000000000 -0.001792434  0.007954311
## mnth        4.967019e-01  0.831440114 -0.001792434  1.000000000  0.019190895
## holiday     1.614463e-02 -0.010536659  0.007954311  0.019190895  1.000000000
## weekday    -1.617914e-05 -0.003079881 -0.005460765  0.009509313 -0.101960269
## workingday -4.336537e-03  0.012484963 -0.002012621 -0.005900951 -0.253022700
## weathersit -2.147721e-02  0.019211028 -0.048726541  0.043528098 -0.034626841
## temp        1.505803e-01  0.334314856  0.047603572  0.220205335 -0.028555535
## atemp       1.526382e-01  0.342875613  0.046106149  0.227458630 -0.032506692
## hum         1.637471e-02  0.205444765 -0.110651045  0.222203691 -0.015937479
## windspeed  -1.126196e-01 -0.229046337 -0.011817060 -0.207501752  0.006291507
## casual      2.752552e-01  0.210399165  0.248545664  0.123005889  0.054274203
## registered  6.596229e-01  0.411623051  0.594248168  0.293487830 -0.108744863
## cnt         6.288303e-01  0.406100371  0.566709708  0.279977112 -0.068347716
##                  weekday   workingday  weathersit          temp        atemp
## instant    -1.617914e-05 -0.004336537 -0.02147721  0.1505803019  0.152638238
## season     -3.079881e-03  0.012484963  0.01921103  0.3343148564  0.342875613
## yr         -5.460765e-03 -0.002012621 -0.04872654  0.0476035719  0.046106149
## mnth        9.509313e-03 -0.005900951  0.04352810  0.2202053352  0.227458630
## holiday    -1.019603e-01 -0.253022700 -0.03462684 -0.0285555350 -0.032506692
## weekday     1.000000e+00  0.035789674  0.03108747 -0.0001699624 -0.007537132
## workingday  3.578967e-02  1.000000000  0.06120043  0.0526598102  0.052182275
## weathersit  3.108747e-02  0.061200430  1.00000000 -0.1206022365 -0.121583354
## temp       -1.699624e-04  0.052659810 -0.12060224  1.0000000000  0.991701553
## atemp      -7.537132e-03  0.052182275 -0.12158335  0.9917015532  1.000000000
## hum        -5.223210e-02  0.024327046  0.59104460  0.1269629390  0.139988060
## windspeed   1.428212e-02 -0.018796487  0.03951106 -0.1579441204 -0.183642967
## casual      5.992264e-02 -0.518044191 -0.24735300  0.5432846617  0.543863690
## registered  5.736744e-02  0.303907117 -0.26038771  0.5400119662  0.544191758
## cnt         6.744341e-02  0.061156063 -0.29739124  0.6274940090  0.631065700
##                    hum    windspeed      casual  registered         cnt
## instant     0.01637471 -0.112619556  0.27525521  0.65962287  0.62883027
## season      0.20544476 -0.229046337  0.21039916  0.41162305  0.40610037
## yr         -0.11065104 -0.011817060  0.24854566  0.59424817  0.56670971
## mnth        0.22220369 -0.207501752  0.12300589  0.29348783  0.27997711
## holiday    -0.01593748  0.006291507  0.05427420 -0.10874486 -0.06834772
## weekday    -0.05223210  0.014282124  0.05992264  0.05736744  0.06744341
## workingday  0.02432705 -0.018796487 -0.51804419  0.30390712  0.06115606
## weathersit  0.59104460  0.039511059 -0.24735300 -0.26038771 -0.29739124
## temp        0.12696294 -0.157944120  0.54328466  0.54001197  0.62749401
## atemp       0.13998806 -0.183642967  0.54386369  0.54419176  0.63106570
## hum         1.00000000 -0.248489099 -0.07700788 -0.09108860 -0.10065856
## windspeed  -0.24848910  1.000000000 -0.16761335 -0.21744898 -0.23454500
## casual     -0.07700788 -0.167613349  1.00000000  0.39528245  0.67280443
## registered -0.09108860 -0.217448981  0.39528245  1.00000000  0.94551692
## cnt        -0.10065856 -0.234544997  0.67280443  0.94551692  1.00000000

# Colour-coded heat map of the correlation matrix (optional).
# corrplot is called through :: so the package must be installed, but it does
# not need to be attached with library().
corrplot::corrplot(corr = correlation_matrix, method = "color")

# Split the data into training and testing sets
# Fix the RNG seed immediately before partitioning so the split is reproducible.
set.seed(123)
# Draw the training-row indices based on 'cnt' with p = 0.8; list = FALSE
# asks for a non-list result that can be used directly as a row index.
trainIndex <- createDataPartition(bike_data$cnt, p = 0.8, list = FALSE)
# The selected rows form the training set; every remaining row goes to the
# test set (negative indexing drops the training rows).
train_data <- bike_data[trainIndex, ]
test_data <- bike_data[-trainIndex, ]

# Number of rows and columns in the training set
train_data %>% dim()

## [1] 587  15

# Number of rows and columns in the testing set
test_data %>% dim()

## [1] 144  15

# Summary statistics for every column of the training set
train_data %>% summary()

##     instant          season            yr              mnth       
##  Min.   :  1.0   Min.   :1.000   Min.   :0.0000   Min.   : 1.000  
##  1st Qu.:180.5   1st Qu.:2.000   1st Qu.:0.0000   1st Qu.: 4.000  
##  Median :360.0   Median :3.000   Median :0.0000   Median : 7.000  
##  Mean   :364.1   Mean   :2.516   Mean   :0.4957   Mean   : 6.518  
##  3rd Qu.:548.5   3rd Qu.:3.500   3rd Qu.:1.0000   3rd Qu.:10.000  
##  Max.   :730.0   Max.   :4.000   Max.   :1.0000   Max.   :12.000  
##     holiday           weekday       workingday       weathersit   
##  Min.   :0.00000   Min.   :0.00   Min.   :0.0000   Min.   :1.000  
##  1st Qu.:0.00000   1st Qu.:1.00   1st Qu.:0.0000   1st Qu.:1.000  
##  Median :0.00000   Median :3.00   Median :1.0000   Median :1.000  
##  Mean   :0.03578   Mean   :2.92   Mean   :0.6661   Mean   :1.407  
##  3rd Qu.:0.00000   3rd Qu.:5.00   3rd Qu.:1.0000   3rd Qu.:2.000  
##  Max.   :1.00000   Max.   :6.00   Max.   :1.0000   Max.   :3.000  
##       temp             atemp              hum           windspeed      
##  Min.   :0.05913   Min.   :0.07907   Min.   :0.0000   Min.   :0.02239  
##  1st Qu.:0.33875   1st Qu.:0.33751   1st Qu.:0.5238   1st Qu.:0.13309  
##  Median :0.49833   Median :0.48800   Median :0.6342   Median :0.18097  
##  Mean   :0.49469   Mean   :0.47435   Mean   :0.6314   Mean   :0.18986  
##  3rd Qu.:0.65375   3rd Qu.:0.60797   3rd Qu.:0.7347   3rd Qu.:0.23289  
##  Max.   :0.84917   Max.   :0.84090   Max.   :0.9725   Max.   :0.50746  
##      casual         registered        cnt      
##  Min.   :   2.0   Min.   :  20   Min.   :  22  
##  1st Qu.: 318.0   1st Qu.:2506   1st Qu.:3152  
##  Median : 699.0   Median :3658   Median :4548  
##  Mean   : 855.5   Mean   :3640   Mean   :4496  
##  3rd Qu.:1125.0   3rd Qu.:4724   3rd Qu.:5956  
##  Max.   :3283.0   Max.   :6946   Max.   :8714

# Summary statistics for every column of the testing set
test_data %>% summary()

##     instant          season            yr              mnth           holiday 
##  Min.   :  2.0   Min.   :1.000   Min.   :0.0000   Min.   : 1.000   Min.   :0  
##  1st Qu.:200.8   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.: 3.750   1st Qu.:0  
##  Median :375.0   Median :2.000   Median :1.0000   Median : 7.000   Median :0  
##  Mean   :373.7   Mean   :2.417   Mean   :0.5208   Mean   : 6.528   Mean   :0  
##  3rd Qu.:548.2   3rd Qu.:3.000   3rd Qu.:1.0000   3rd Qu.: 9.250   3rd Qu.:0  
##  Max.   :731.0   Max.   :4.000   Max.   :1.0000   Max.   :12.000   Max.   :0  
##     weekday        workingday       weathersit         temp       
##  Min.   :0.000   Min.   :0.0000   Min.   :1.000   Min.   :0.1508  
##  1st Qu.:2.000   1st Qu.:1.0000   1st Qu.:1.000   1st Qu.:0.3348  
##  Median :3.000   Median :1.0000   Median :1.000   Median :0.4971  
##  Mean   :3.312   Mean   :0.7569   Mean   :1.347   Mean   :0.4982  
##  3rd Qu.:5.000   3rd Qu.:1.0000   3rd Qu.:2.000   3rd Qu.:0.6598  
##  Max.   :6.000   Max.   :1.0000   Max.   :3.000   Max.   :0.8617  
##      atemp             hum           windspeed           casual      
##  Min.   :0.1509   Min.   :0.2758   Min.   :0.04665   Min.   :   9.0  
##  1st Qu.:0.3388   1st Qu.:0.5182   1st Qu.:0.14085   1st Qu.: 255.0  
##  Median :0.4709   Median :0.5902   Median :0.18129   Median : 756.0  
##  Mean   :0.4744   Mean   :0.6135   Mean   :0.19304   Mean   : 818.2  
##  3rd Qu.:0.6114   3rd Qu.:0.7039   3rd Qu.:0.24006   3rd Qu.:1070.0  
##  Max.   :0.8264   Max.   :0.9483   Max.   :0.41791   Max.   :3410.0  
##    registered        cnt      
##  Min.   : 491   Min.   : 605  
##  1st Qu.:2461   1st Qu.:3184  
##  Median :3680   Median :4553  
##  Mean   :3721   Mean   :4539  
##  3rd Qu.:4892   3rd Qu.:5960  
##  Max.   :6911   Max.   :8294

# Histogram of the target variable 'cnt' within the training set
hist(
  x = train_data$cnt,
  xlab = "Count of Total Rental Bikes",
  main = "Distribution of 'cnt' in Training Set"
)

# Histogram of the target variable 'cnt' within the testing set
hist(
  x = test_data$cnt,
  xlab = "Count of Total Rental Bikes",
  main = "Distribution of 'cnt' in Testing Set"
)

Variable Selection

# Predictor columns kept for the 'casual' model.
# NOTE(review): 'cnt' is the total rental count and already contains the casual
# count (first data row: casual 331 + registered 654 = cnt 985), so using it as
# a predictor leaks the target into the model. Confirm this is intended.
selected_vars <- c("temp", "atemp", "hum", "windspeed", "cnt")

# Reduce both splits to the response ('casual') followed by the predictors.
train_data <- train_data[c("casual", selected_vars)]
test_data <- test_data[c("casual", selected_vars)]

Model Building

# Fit a linear regression of 'casual' on every other column of train_data.
# The "." on the right-hand side of the formula expands to all remaining columns.
model <- lm(formula = casual ~ ., data = train_data)

# Coefficient table, residual quantiles and overall fit statistics
summary(object = model)

## 
## Call:
## lm(formula = casual ~ ., data = train_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -833.1 -355.6 -123.2  219.1 1892.8 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -252.57300  153.81184  -1.642   0.1011    
## temp         768.75906 1443.18112   0.533   0.5945    
## atemp         65.94128 1642.47185   0.040   0.9680    
## hum         -258.94677  154.44987  -1.677   0.0942 .  
## windspeed   -101.63321  295.08165  -0.344   0.7307    
## cnt            0.19559    0.01456  13.429   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 501.2 on 581 degrees of freedom
## Multiple R-squared:  0.4897, Adjusted R-squared:  0.4853 
## F-statistic: 111.5 on 5 and 581 DF,  p-value: < 2.2e-16

# Predict 'casual' for the held-out test rows with the fitted model
predictions <- predict(object = model, newdata = test_data)

Model Evaluation

# Root mean squared error of the test-set predictions
rmse <- sqrt(mean((predictions - test_data$casual)^2))

# Squared Pearson correlation between predicted and observed values
r_squared <- cor(predictions, test_data$casual)^2

print(paste("RMSE:", rmse))

## [1] "RMSE: 481.801189765702"

print(paste("R-squared:", r_squared))

## [1] "R-squared: 0.447791165185354"

# Draw the lm diagnostic plots in a 2x2 grid, then restore the previous
# graphics settings so that later plots are not squeezed into the same grid.
old_par <- par(mfrow = c(2, 2))  # par() returns the settings it replaced
plot(model)                      # Generate diagnostic plots
par(old_par)                     # Put the earlier layout back

Final Model and Interpretation

Based on the model evaluation metrics, the linear regression model achieved only moderate accuracy in predicting the casual bike rentals. The RMSE value (about 482 rentals on the test set) indicates the typical size of the error in the model’s predictions, while the R-squared value (about 0.45) shows the proportion of variance explained by the model. The diagnostic plots provide insights into the model’s assumptions and performance. Further model refinement and feature engineering may be needed to improve the model’s predictive accuracy.

8a

Report the pairwise correlations for the variables used in the model. Report the significant pairwise correlations.

# Pairwise correlations among the variables chosen for the model.
# NOTE(review): the response 'casual' is not in this list, so the matrix holds
# correlations among the predictors only. Confirm whether 'casual' should be added.
selected_vars <- c("temp", "atemp", "hum", "windspeed", "cnt")

# Take the columns from the full dataset (bike_data), not just the training split
selected_data <- bike_data[selected_vars]

# Pearson correlation for every pair of selected columns
correlation_matrix <- cor(x = selected_data)
print(correlation_matrix)

##                 temp      atemp        hum  windspeed        cnt
## temp       1.0000000  0.9917016  0.1269629 -0.1579441  0.6274940
## atemp      0.9917016  1.0000000  0.1399881 -0.1836430  0.6310657
## hum        0.1269629  0.1399881  1.0000000 -0.2484891 -0.1006586
## windspeed -0.1579441 -0.1836430 -0.2484891  1.0000000 -0.2345450
## cnt        0.6274940  0.6310657 -0.1006586 -0.2345450  1.0000000

# Keep the correlations whose absolute value exceeds 0.5, skipping entries
# equal to 1 (the diagonal).
# NOTE(review): this is a magnitude filter, not a statistical significance
# test. Logical indexing flattens the matrix, so the variable names are lost
# and every pair is listed twice.
significant_correlations <- correlation_matrix[correlation_matrix != 1 & abs(correlation_matrix) > 0.5]
print(significant_correlations)

## [1] 0.9917016 0.6274940 0.9917016 0.6310657 0.6274940 0.6310657

The pairwise correlation analysis helped identify the most relevant variables for predicting casual bike rentals. The selected variables, including temperature, humidity, and windspeed, were used to build the linear regression model.

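The pairwise correlations above 0.5 in absolute value are temp–atemp (0.99), temp–cnt (0.63) and atemp–cnt (0.63); the very high temp–atemp correlation indicates multicollinearity between these two predictors. The matrix above does not include the target variable ‘casual’; in the full correlation matrix reported earlier, ‘casual’ correlates with cnt (0.67), temp (0.54) and atemp (0.54), and only weakly with hum (−0.08) and windspeed (−0.17). The linear regression model leverages these relationships to predict casual bike rentals based on weather conditions and other factors.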

From the full correlation matrix reported earlier, we can see that temperature (‘temp’ and ‘atemp’) has a moderate positive correlation with casual bike rentals (about 0.54), while humidity (‘hum’, −0.08) and windspeed (−0.17) have weak negative correlations.

This could be interpreted as follows:

Higher temperatures are associated with more casual bike rentals, as people are more likely to ride bikes in warmer weather.
Higher humidity levels are associated with fewer casual bike rentals, as people may be less inclined to ride bikes in humid conditions.
Higher windspeed is associated with fewer casual bike rentals, as strong winds may deter people from riding bikes.

These insights help us understand the relationships between weather conditions and casual bike rentals, which are essential for building an accurate predictive model.

8b

Now write the results of the model showing the coefficient estimates. Are there any variables that are insignificant?

# Print the coefficient table (estimate, standard error, t value, p-value)
print(coef(summary(model)))

##                 Estimate   Std. Error     t value     Pr(>|t|)
## (Intercept) -252.5729975 1.538118e+02 -1.64209070 1.011123e-01
## temp         768.7590592 1.443181e+03  0.53268370 5.944562e-01
## atemp         65.9412843 1.642472e+03  0.04014759 9.679893e-01
## hum         -258.9467654 1.544499e+02 -1.67657482 9.416359e-02
## windspeed   -101.6332083 2.950817e+02 -0.34442402 7.306521e-01
## cnt            0.1955859 1.456414e-02 13.42927670 5.292954e-36

# Rows of the coefficient table whose p-value is above 0.05.
# The filter runs over every row, so the intercept can be listed as well.
insignificant_vars <- coef(summary(model))[coef(summary(model))[, "Pr(>|t|)"] > 0.05, ]
print(insignificant_vars)

##               Estimate Std. Error     t value   Pr(>|t|)
## (Intercept) -252.57300   153.8118 -1.64209070 0.10111232
## temp         768.75906  1443.1811  0.53268370 0.59445622
## atemp         65.94128  1642.4718  0.04014759 0.96798925
## hum         -258.94677   154.4499 -1.67657482 0.09416359
## windspeed   -101.63321   295.0817 -0.34442402 0.73065206

The coefficient estimates of the linear regression model provide insights into the relationships between the predictor variables and the target variable ‘casual’ (casual bike rentals). The coefficients represent the impact of each predictor variable on the target variable, holding other variables constant.

From the coefficient estimates, we can observe the following:

The ‘temp’ variable has a positive coefficient estimate, indicating that an increase in temperature is associated with more casual bike rentals.
The ‘atemp’ variable also has a positive coefficient estimate, suggesting a similar relationship to temperature.
The ‘hum’ variable has a negative coefficient estimate, indicating that higher humidity levels are associated with fewer casual bike rentals.
The ‘windspeed’ variable has a negative coefficient estimate, suggesting that higher windspeed is associated with fewer casual bike rentals.

Based on the p-values of the coefficient estimates, we can identify insignificant variables that do not have a significant impact on casual bike rentals. These variables may not contribute significantly to the model’s predictive power and could potentially be removed to simplify the model. At the 5% level, ‘temp’ (p = 0.59), ‘atemp’ (p = 0.97), ‘hum’ (p = 0.09) and ‘windspeed’ (p = 0.73) are all insignificant, as is the intercept (p = 0.10); only ‘cnt’ is significant (p < 2e-16).

8c

How do you interpret the effects of the predictors towards the dependent variable?

The effects of the predictors on the dependent variable ‘casual’ (casual bike rentals) can be interpreted based on the coefficient estimates of the linear regression model.

Here are the interpretations of the effects of the predictors:

Temperature (‘temp’ and ‘atemp’): An increase in temperature is associated with more casual bike rentals. Warmer weather conditions are likely to encourage people to ride bikes, leading to higher casual bike rentals.
Humidity (‘hum’): Higher humidity levels are associated with fewer casual bike rentals. Humid conditions may make biking less comfortable or appealing to individuals, resulting in lower casual bike rentals.
Windspeed (‘windspeed’): Higher windspeed is associated with fewer casual bike rentals. Strong winds can make biking more challenging and less enjoyable, leading to a decrease in casual bike rentals.

These interpretations provide insights into how weather conditions impact casual bike rentals and help us understand the relationships between the predictors and the dependent variable. By considering these effects, we can make informed decisions about how to optimize bike-sharing services based on weather forecasts and other factors.

8d

Check the histogram of residual plots – do they resemble a Normal Distribution?

# Histogram of the fitted model's residuals
hist(
  x = residuals(model),
  xlab = "Residuals",
  main = "Histogram of Residuals"
)

The histogram of residuals provides insights into the distribution of errors in the linear regression model. Ideally, the residuals should resemble a normal distribution, indicating that the model’s assumptions are met, and the errors are normally distributed around zero.

From the histogram of residuals, we can observe the following:

The residuals appear to be approximately normally distributed, centered around zero.
The histogram shows a bell-shaped curve, suggesting that the residuals follow a normal distribution.
There are no significant deviations or patterns in the histogram, indicating that the model’s assumptions are reasonable.
There is some skewness in the distribution, but it is not severe enough to invalidate the model. The skewness may be due to outliers or other factors and appears only slightly in the histogram.

Overall, the histogram of residuals supports the validity of the linear regression model and suggests that the errors are normally distributed.

8e

Check the scatter plot of the residuals/standardised residuals. What do you observe?

# Plot each residual against its position in the training data
plot(
  residuals(model),
  xlab = "Observation Number",
  ylab = "Residuals",
  main = "Scatter Plot of Residuals"
)

The scatter plot of residuals provides insights into the distribution of errors across observations in the linear regression model. By examining the scatter plot, we can identify patterns, outliers, or other issues that may affect the model’s performance and assumptions.

From the scatter plot of residuals, we can observe the following:

The residuals are scattered around zero, indicating that the errors are randomly distributed across observations.
There are no clear patterns or trends in the scatter plot, suggesting that the model’s assumptions are reasonable.
The residuals appear to be evenly distributed across observations, with no systematic bias or outliers.
The scatter plot shows a random pattern, indicating that the model captures the relationships between the predictors and the dependent variable effectively.

Overall, the scatter plot of residuals supports the validity of the linear regression model and suggests that the errors are randomly distributed.

8f

Now, build a multiple linear regression model to predict the target variable registered. Use numerical variables only. Just report the results of the model showing the coefficient estimates. Are there any variables that are insignificant?

# Keep only the numeric columns of the full dataset.
# select(where(is.numeric)) replaces the superseded select_if(is.numeric).
numerical_vars <- bike_data %>% select(where(is.numeric))

# Move the response 'registered' to the first column. everything() keeps all
# the other numeric columns, so no predictor is dropped at this step.
# NOTE(review): 'casual' and 'cnt' therefore remain predictors. The fit below
# recovers registered = cnt - casual exactly (coefficients 1 and -1, with an
# "essentially perfect fit" warning), which makes the remaining coefficients
# and p-values uninformative. Consider excluding 'casual', 'cnt' and the
# sequential row index 'instant' before fitting.
model_data <- numerical_vars %>% select(registered, everything())

# Train the multiple linear regression model for "registered"
model_registered <- lm(registered ~ ., data = model_data)

# Extract the coefficient table (estimate, standard error, t value, p-value)
coef_summary <- summary(model_registered)$coefficients

## Warning in summary.lm(model_registered): essentially perfect fit: summary may
## be unreliable

print("Coefficient Estimates:")

## [1] "Coefficient Estimates:"

print(coef_summary)

##                  Estimate   Std. Error       t value     Pr(>|t|)
## (Intercept)  1.753843e-12 6.220657e-13  2.819385e+00 4.944524e-03
## instant      5.348901e-15 8.992924e-15  5.947900e-01 5.521718e-01
## season       7.203872e-14 1.432695e-13  5.028196e-01 6.152457e-01
## yr          -2.245384e-12 3.319355e-12 -6.764518e-01 4.989723e-01
## mnth        -2.355103e-13 2.773653e-13 -8.490978e-01 3.961107e-01
## holiday     -3.200701e-13 4.924299e-13 -6.499810e-01 5.159129e-01
## weekday      1.870852e-14 4.017271e-14  4.657023e-01 6.415702e-01
## workingday  -1.674544e-12 2.979035e-13 -5.621094e+00 2.719699e-08
## weathersit   2.910984e-14 2.003871e-13  1.452680e-01 8.845402e-01
## temp         7.393088e-12 3.427764e-12  2.156825e+00 3.135149e-02
## atemp       -9.015114e-12 3.883823e-12 -2.321196e+00 2.055617e-02
## hum         -1.516887e-12 7.719190e-13 -1.965086e+00 4.979054e-02
## windspeed    1.027881e-13 1.134363e-12  9.061309e-02 9.278254e-01
## casual      -1.000000e+00 2.784172e-16 -3.591733e+15 0.000000e+00
## cnt          1.000000e+00 1.235963e-16  8.090857e+15 0.000000e+00

# Rows of coef_summary whose p-value (column "Pr(>|t|)") is above 0.05
insignificant_vars_registered <- coef_summary[coef_summary[, "Pr(>|t|)"] > 0.05, ]
print("Insignificant Variables:")

## [1] "Insignificant Variables:"

print(insignificant_vars_registered)

##                 Estimate   Std. Error     t value  Pr(>|t|)
## instant     5.348901e-15 8.992924e-15  0.59478997 0.5521718
## season      7.203872e-14 1.432695e-13  0.50281964 0.6152457
## yr         -2.245384e-12 3.319355e-12 -0.67645183 0.4989723
## mnth       -2.355103e-13 2.773653e-13 -0.84909779 0.3961107
## holiday    -3.200701e-13 4.924299e-13 -0.64998104 0.5159129
## weekday     1.870852e-14 4.017271e-14  0.46570234 0.6415702
## weathersit  2.910984e-14 2.003871e-13  0.14526800 0.8845402
## windspeed   1.027881e-13 1.134363e-12  0.09061309 0.9278254

Looking at the coefficient estimates of the multiple linear regression model for predicting the target variable ‘registered,’ we can observe the following:

The model includes numerical variables as predictors to predict the number of registered bike rentals.
The coefficient estimates represent the impact of each predictor variable on the target variable ‘registered,’ holding other variables constant.
The results show the coefficients, standard errors, t-values, and p-values for each predictor variable in the model.
Based on the p-values of the coefficient estimates, we can identify insignificant variables that do not have a significant impact on the number of registered bike rentals. These variables may not contribute significantly to the model’s predictive power and could potentially be removed to simplify the model.
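The insignificant variables identified based on p-values (p > 0.05) are ‘instant,’ ‘season,’ ‘yr,’ ‘mnth,’ ‘holiday,’ ‘weekday,’ ‘weathersit,’ and ‘windspeed.’ The remaining predictors (‘workingday,’ ‘temp,’ ‘atemp,’ ‘hum,’ ‘casual,’ and ‘cnt’) are significant at the 5% level. However, R warns of an “essentially perfect fit”: because registered = cnt − casual, the coefficients on ‘cnt’ and ‘casual’ are exactly 1 and −1 and all other estimates are numerically zero, so these p-values should be interpreted with caution.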

The coefficient estimates provide insights into the relationships between the numerical predictor variables and the target variable ‘registered.’ By identifying significant and insignificant variables, we can refine the model and focus on the most relevant predictors for predicting registered bike rentals.

8g

How do you compare significant variables for casual versus registered – interpretations?

The comparison of significant variables for casual and registered bike rentals provides insights into the factors that influence these two types of bike rentals differently. By examining the significant predictors for casual and registered bike rentals, we can understand the unique relationships between weather conditions, time-related factors, and other variables with each type of bike rental.

Here are the interpretations of the significant variables for casual and registered bike rentals:

Significant Variables for Casual Bike Rentals:
- Total rentals (‘cnt’): The only predictor significant at the 5% level (p < 2e-16). Each additional total rental is associated with about 0.20 more casual rentals, holding the weather variables constant.
- Temperature (‘temp’ and ‘atemp’): Both coefficients are positive, consistent with more casual bike rentals in warmer weather, but neither is statistically significant (p = 0.59 and p = 0.97), likely because the two variables are almost perfectly correlated (0.99).
- Humidity (‘hum’): The coefficient is negative, consistent with fewer casual bike rentals in humid conditions, but it is significant only at the 10% level (p = 0.09).
- Windspeed (‘windspeed’): The coefficient is negative, consistent with strong winds discouraging people from biking, but it is not statistically significant (p = 0.73).
Significant Variables for Registered Bike Rentals:
- Total and casual rentals (‘cnt’ and ‘casual’): The coefficients are exactly 1 and −1, reflecting the identity registered = cnt − casual rather than a behavioural effect.
- Working day (‘workingday’), temperature (‘temp’ and ‘atemp’) and humidity (‘hum’): These are significant at the 5% level, but their estimates are numerically zero (of order 1e-12) because of the perfect fit, so they carry no practical interpretation.
- Season (‘season’), weather situation (‘weathersit’) and day of the week (‘weekday’): These are not significant in this model (p > 0.05). The dataset contains no hour-of-day (‘hr’) variable, so no conclusion about the time of day can be drawn.

By comparing the significant variables for casual and registered bike rentals, we can tailor marketing strategies, operational decisions, and service offerings to meet the unique needs and preferences of each user segment. Understanding the distinct factors that drive casual and registered bike rentals allows bike-sharing companies to optimize their services, improve customer satisfaction, and enhance business performance.

8h

Now, write the prediction equation for the model with the outcome variable casual.

The prediction equation for the linear regression model with the outcome variable ‘casual’ can be written as follows:

\[ \text{casual} = \beta_0 + \beta_1 \times \text{temp} + \beta_2 \times \text{atemp} + \beta_3 \times \text{hum} + \beta_4 \times \text{windspeed} \]

Where:

- \(\text{casual}\) is the predicted number of casual bike rentals

- \(\beta_0, \beta_1, \beta_2, \beta_3, \beta_4\) are the coefficients estimated by the linear regression model

- \(\text{temp}\) is the temperature variable

- \(\text{atemp}\) is the apparent temperature variable

- \(\text{hum}\) is the humidity variable

- \(\text{windspeed}\) is the windspeed variable.

The prediction equation represents the relationship between the predictor variables (temperature, apparent temperature, humidity, windspeed) and the outcome variable ‘casual’ (number of casual bike rentals). By plugging in the values of the predictor variables, we can predict the number of casual bike rentals based on weather conditions and other factors.

8i

Can you use the prediction equation from (h) to predict with the test data?

# Fit a separate 'casual' model on the four weather predictors that appear in
# the prediction equation from (h); unlike `model`, it leaves out 'cnt'.
model_casual <- lm(formula = casual ~ temp + atemp + hum + windspeed, data = train_data)

# Pull the same four predictor columns out of the test set
test_predictors <- test_data[, c("temp", "atemp", "hum", "windspeed")]

# Apply the fitted equation to the test rows
test_predictions <- predict(object = model_casual, newdata = test_predictors)

# Display the predicted values
print("Predicted Casual Bike Rentals:")

## [1] "Predicted Casual Bike Rentals:"

print(test_predictions)

##         2         3        10        15        18        28        29        33 
##  470.3014  253.3500  144.6383  443.9853  130.1537  181.2449  242.2294  148.6241 
##        45        49        56        61        65        66        70        75 
##  712.1481 1004.9102  378.9617  529.3614  242.8610  208.0152  389.4522  486.4497 
##        77        83        89       103       106       115       118       119 
## 1055.1562  169.8317  423.3137  554.8638  449.3499  987.6462  959.6334 1026.8300 
##       124       125       129       143       146       148       152       153 
##  515.8917  851.5453 1045.1617  958.3169 1233.5784 1077.2899 1412.3263 1411.5915 
##       175       188       194       197       202       222       226       229 
## 1286.7249 1351.3593 1389.2282 1264.1748 1731.3657 1471.9845 1033.5325 1376.9934 
##       231       232       235       239       245       246       257       259 
## 1185.6996 1291.3801 1340.5224  945.2203 1126.0211 1180.0959 1177.7467  887.5257 
##       263       271       279       282       287       292       308       309 
##  834.8591  922.4971  924.2517  996.7429  907.9470  706.8062  651.9029  579.4046 
##       311       313       316       329       330       342       346       350 
##  689.6318  711.7987  630.5753  699.1712  705.8719  318.7937  451.5701  622.9880 
##       351       358       361       362       364       371       372       374 
##  312.1130  496.7320  407.4262  388.1067  524.2565  629.1341  751.2662  334.8691 
##       376       378       391       393       402       403       415       419 
##  272.1230  243.6344  566.2943  569.6546  473.6292  732.9187  375.8978  851.3101 
##       424       430       436       440       441       451       462       464 
##  744.8480  351.6433  682.7586 1066.9067  671.4510  769.3689  775.0461 1165.9782 
##       466       473       483       484       490       495       499       505 
##  883.3527 1318.9599  893.7604  783.9821 1071.0794  950.1047 1150.6803 1283.2644 
##       506       508       515       516       517       521       525       530 
## 1151.3453 1054.3862 1216.6548 1181.3739 1326.2549 1172.6442 1269.6690 1111.7748 
##       537       539       541       548       549       554       558       559 
## 1517.0480 1503.8453 1454.1689 1609.0450 1503.8826 1783.1232 1272.1632 1374.0532 
##       565       572       573       577       578       585       591       594 
## 1593.6789 1410.8608 1477.4539 1358.6530 1260.7585 1398.0107 1391.4775 1381.2275 
##       595       611       620       621       622       628       637       644 
## -306.7573 1209.9140 1208.5876 1170.7994 1176.2809 1092.3056 1096.7409 1167.2476 
##       646       649       662       676       680       684       687       690 
##  719.1923  943.0914 1062.7687  448.5278  758.0596  432.1917  577.0517  678.9244 
##       693       696       713       715       717       722       728       731 
##  725.9984  736.6066  540.5886  591.4063  551.6681  235.4025  375.9131  312.6227

Yes, the prediction equation derived from the linear regression model for ‘casual’ bike rentals can be used to predict the number of casual bike rentals with the test data. By plugging in the values of the predictor variables (temperature, apparent temperature, humidity, windspeed) from the test data into the prediction equation, we can generate predictions for the number of casual bike rentals.

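The linear regression model leverages the relationships between the predictor variables and the outcome variable ‘casual’ to generate predictions for the test data. By applying the prediction equation to the test data, we can estimate the number of casual bike rentals under different weather conditions and other factors. Note, however, that a linear model is not constrained to non-negative values: observation 595 receives a negative prediction (about −307), which is impossible for a rental count, so the predictions should be treated as approximate.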

BU7154_Individual_Assignment

Megan Downing

2024-10-18