library(readr)
# Load the bike-rental CSV (column-type message suppressed) and preview
# the first rows.
bikes_csv <- "C:/Users/PMLS/Downloads/bikes.csv"
data <- read_csv(file = bikes_csv, show_col_types = FALSE)
head(data)
## # A tibble: 6 × 10
## date season holiday weekday weather temperature realfeel humidity
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2011-01-01 1 0 6 2 46.7 46.4 0.806
## 2 2011-01-02 1 0 0 2 48.4 45.2 0.696
## 3 2011-01-03 1 0 1 1 34.2 25.7 0.437
## 4 2011-01-04 1 0 2 1 34.5 28.4 0.590
## 5 2011-01-05 1 0 3 1 36.8 30.4 0.437
## 6 2011-01-06 1 0 4 1 34.9 30.9 0.518
## # ℹ 2 more variables: windspeed <dbl>, rentals <dbl>
# Pairwise correlation matrix of the continuous weather variables and the
# rental count (cor() uses its default method here).
numeric_vars <- c("temperature", "realfeel", "humidity", "windspeed", "rentals")
correlations <- cor(data[numeric_vars])
print(correlations)
## temperature realfeel humidity windspeed rentals
## temperature 1.0000000 0.9917016 0.1269629 -0.1579441 0.6274940
## realfeel 0.9917016 1.0000000 0.1399881 -0.1836430 0.6310657
## humidity 0.1269629 0.1399881 1.0000000 -0.2484891 -0.1006586
## windspeed -0.1579441 -0.1836430 -0.2484891 1.0000000 -0.2345450
## rentals 0.6274940 0.6310657 -0.1006586 -0.2345450 1.0000000
Interpret the magnitude (absolute value) of the correlation coefficient (Pearson, Spearman, or Kendall’s tau):
Absolute value closer to 1: stronger relationship (positive or negative, according to the sign).
Closer to 0: weaker relationship, or no linear relationship.
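Model fit: R-squared (proportion of the variance in rentals explained by the model).
Prediction error: RMSE or MAE (in the same units as rentals; lower is better).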
# Scatter plots of rentals against each weather predictor.
# The formula + `data =` form avoids repeating `data$` and labels the axes
# with the plain column names instead of "data$rentals" / "data$temperature".
plot(rentals ~ temperature, data = data)
plot(rentals ~ humidity, data = data)
plot(rentals ~ realfeel, data = data)
plot(rentals ~ windspeed, data = data)
# Multiple linear regression of rentals on the four weather variables,
# fitted on the full dataset; the summary is auto-printed.
model <- lm(
  formula = rentals ~ temperature + realfeel + humidity + windspeed,
  data = data
)
summary(model)
##
## Call:
## lm(formula = rentals ~ temperature + realfeel + humidity + windspeed,
## data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4855 -1046 -79 1055 3564
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3282.60 467.00 7.029 4.80e-12 ***
## temperature 24.96 26.98 0.925 0.3551
## realfeel 43.26 21.69 1.994 0.0465 *
## humidity -3149.11 383.99 -8.201 1.08e-15 ***
## windspeed -108.78 17.32 -6.280 5.82e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1422 on 726 degrees of freedom
## Multiple R-squared: 0.4638, Adjusted R-squared: 0.4609
## F-statistic: 157 on 4 and 726 DF, p-value: < 2.2e-16
# Read the bike-rental CSV, convert the result to a base data.frame,
# and inspect its structure.
library(readr)
data1 <- read_csv(
  "C:/Users/PMLS/Downloads/bikes.csv",
  show_col_types = FALSE
)
data <- as.data.frame(data1)
str(data)
## 'data.frame': 731 obs. of 10 variables:
## $ date : Date, format: "2011-01-01" "2011-01-02" ...
## $ season : num 1 1 1 1 1 1 1 1 1 1 ...
## $ holiday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ weekday : num 6 0 1 2 3 4 5 6 0 1 ...
## $ weather : num 2 2 1 1 1 1 2 2 1 1 ...
## $ temperature: num 46.7 48.4 34.2 34.5 36.8 ...
## $ realfeel : num 46.4 45.2 25.7 28.4 30.4 ...
## $ humidity : num 0.806 0.696 0.437 0.59 0.437 ...
## $ windspeed : num 6.68 10.35 10.34 6.67 7.78 ...
## $ rentals : num 985 801 1349 1562 1600 ...
# Per-column summary statistics (min, quartiles, median, mean, max)
summary(data)
## date season holiday weekday
## Min. :2011-01-01 Min. :1.000 Min. :0.00000 Min. :0.000
## 1st Qu.:2011-07-02 1st Qu.:2.000 1st Qu.:0.00000 1st Qu.:1.000
## Median :2012-01-01 Median :3.000 Median :0.00000 Median :3.000
## Mean :2012-01-01 Mean :2.497 Mean :0.02873 Mean :2.997
## 3rd Qu.:2012-07-01 3rd Qu.:3.000 3rd Qu.:0.00000 3rd Qu.:5.000
## Max. :2012-12-31 Max. :4.000 Max. :1.00000 Max. :6.000
## weather temperature realfeel humidity
## Min. :1.000 Min. :22.60 Min. : 12.59 Min. :0.0000
## 1st Qu.:1.000 1st Qu.:46.12 1st Qu.: 43.34 1st Qu.:0.5200
## Median :1.000 Median :59.76 Median : 61.02 Median :0.6267
## Mean :1.395 Mean :59.51 Mean : 59.55 Mean :0.6279
## 3rd Qu.:2.000 3rd Qu.:73.05 3rd Qu.: 75.50 3rd Qu.:0.7302
## Max. :3.000 Max. :90.50 Max. :103.10 Max. :0.9725
## windspeed rentals
## Min. : 0.9322 Min. : 22
## 1st Qu.: 5.6182 1st Qu.:3152
## Median : 7.5343 Median :4548
## Mean : 7.9303 Mean :4504
## 3rd Qu.: 9.7092 3rd Qu.:5956
## Max. :21.1266 Max. :8714
# Handle missing values (if any)(impute or remove)
# Create additional features (if necessary)(combine variables, transform data)
# Hold-out evaluation: fit the regression on a random 70% of the rows and
# measure prediction error on the remaining 30%.
# (A full-data lm() fit that used to precede the split was overwritten by the
# training fit before being used, so it has been removed.)

# Splitting the data 70:30
set.seed(123) # fixed seed so the split is reproducible
train_fraction <- 0.7
n_obs <- nrow(data)
# seq_len() stays correct for a zero-row input (1:0 would yield c(1, 0));
# floor() makes the integer sample size explicit instead of handing
# sample() a fractional size.
train_index <- sample(seq_len(n_obs), size = floor(train_fraction * n_obs))
train_data <- data[train_index, ]
test_data <- data[-train_index, ]

# Fit on the training rows only, then predict the held-out rows
model <- lm(
  rentals ~ temperature + realfeel + humidity + windspeed,
  data = train_data
)
predictions <- predict(model, newdata = test_data)

# Prediction accuracy: root mean squared error on the test set
rmse <- sqrt(mean((predictions - test_data$rentals)^2))
print(paste0("RMSE:", rmse))
## [1] "RMSE:1474.5284010275"