This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.2
# Load the bike-rental dataset from CSV into a data frame.
# NOTE(review): this is an absolute, machine-specific path; the document only
# knits where this file exists. Consider a project-relative path.
data <- read.csv(file = "D:/dataset/db1bike.csv")
# Inspect the column names, column types, and the first few values
str(object = data)
## 'data.frame': 199 obs. of 14 variables:
## $ Date : chr "01-12-2017" "01-12-2017" "01-12-2017" "01-12-2017" ...
## $ Rented_Bike_Count : int 254 204 173 107 78 100 181 460 930 490 ...
## $ Hour : int 0 1 2 3 4 5 6 7 8 9 ...
## $ Temperature : num -5.2 -5.5 -6 -6.2 -6 -6.4 -6.6 -7.4 -7.6 -6.5 ...
## $ Humidity : int 37 38 39 40 36 37 35 38 37 27 ...
## $ Wind_speed : num 2.2 0.8 1 0.9 2.3 1.5 1.3 0.9 1.1 0.5 ...
## $ Visibility : int 2000 2000 2000 2000 2000 2000 2000 2000 2000 1928 ...
## $ Dew.point.temperature: num -17.6 -17.6 -17.7 -17.6 -18.6 -18.7 -19.5 -19.3 -19.8 -22.4 ...
## $ Solar.Radiation : num 0 0 0 0 0 0 0 0 0.01 0.23 ...
## $ Rainfall : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Snowfall : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Seasons : chr "Winter" "Winter" "Winter" "Winter" ...
## $ Holiday : chr "No Holiday" "No Holiday" "No Holiday" "No Holiday" ...
## $ Functioning.Day : chr "Yes" "Yes" "Yes" "Yes" ...
# Per-column descriptive statistics: quartiles, mean, and range for numeric
# columns; length, class, and mode for character columns
summary(object = data)
## Date Rented_Bike_Count Hour Temperature
## Length:199 Min. : 13.0 Min. : 0.0 Min. :-8.100
## Class :character 1st Qu.:191.5 1st Qu.: 5.0 1st Qu.:-3.700
## Mode :character Median :341.0 Median :11.0 Median :-0.800
## Mean :335.5 Mean :11.2 Mean :-0.601
## 3rd Qu.:430.5 3rd Qu.:17.0 3rd Qu.: 3.050
## Max. :937.0 Max. :23.0 Max. : 8.000
## Humidity Wind_speed Visibility Dew.point.temperature
## Min. :21.00 Min. :0.000 Min. : 66.0 Min. :-22.40
## 1st Qu.:37.00 1st Qu.:0.800 1st Qu.: 873.5 1st Qu.:-15.60
## Median :51.00 Median :1.500 Median :1808.0 Median : -7.90
## Mean :57.02 Mean :1.673 Mean :1412.0 Mean : -8.95
## 3rd Qu.:79.00 3rd Qu.:2.450 3rd Qu.:2000.0 3rd Qu.: -3.10
## Max. :96.00 Max. :5.800 Max. :2000.0 Max. : 3.80
## Solar.Radiation Rainfall Snowfall Seasons
## Min. :0.0000 Min. :0.00000 Min. :0.00000 Length:199
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000 Class :character
## Median :0.0000 Median :0.00000 Median :0.00000 Mode :character
## Mean :0.2103 Mean :0.02714 Mean :0.09548
## 3rd Qu.:0.2350 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.2200 Max. :2.50000 Max. :1.00000
## Holiday Functioning.Day
## Length:199 Length:199
## Class :character Class :character
## Mode :character Mode :character
##
##
##
# Count the NA entries in each column of `data`;
# a zero means the column has no missing values
colSums(x = is.na(data), na.rm = FALSE)
## Date Rented_Bike_Count Hour
## 0 0 0
## Temperature Humidity Wind_speed
## 0 0 0
## Visibility Dew.point.temperature Solar.Radiation
## 0 0 0
## Rainfall Snowfall Seasons
## 0 0 0
## Holiday Functioning.Day
## 0 0
# Per-column descriptive statistics (same call as the earlier summary, repeated
# after the missing-value check)
summary(object = data)
## Date Rented_Bike_Count Hour Temperature
## Length:199 Min. : 13.0 Min. : 0.0 Min. :-8.100
## Class :character 1st Qu.:191.5 1st Qu.: 5.0 1st Qu.:-3.700
## Mode :character Median :341.0 Median :11.0 Median :-0.800
## Mean :335.5 Mean :11.2 Mean :-0.601
## 3rd Qu.:430.5 3rd Qu.:17.0 3rd Qu.: 3.050
## Max. :937.0 Max. :23.0 Max. : 8.000
## Humidity Wind_speed Visibility Dew.point.temperature
## Min. :21.00 Min. :0.000 Min. : 66.0 Min. :-22.40
## 1st Qu.:37.00 1st Qu.:0.800 1st Qu.: 873.5 1st Qu.:-15.60
## Median :51.00 Median :1.500 Median :1808.0 Median : -7.90
## Mean :57.02 Mean :1.673 Mean :1412.0 Mean : -8.95
## 3rd Qu.:79.00 3rd Qu.:2.450 3rd Qu.:2000.0 3rd Qu.: -3.10
## Max. :96.00 Max. :5.800 Max. :2000.0 Max. : 3.80
## Solar.Radiation Rainfall Snowfall Seasons
## Min. :0.0000 Min. :0.00000 Min. :0.00000 Length:199
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000 Class :character
## Median :0.0000 Median :0.00000 Median :0.00000 Mode :character
## Mean :0.2103 Mean :0.02714 Mean :0.09548
## 3rd Qu.:0.2350 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.2200 Max. :2.50000 Max. :1.00000
## Holiday Functioning.Day
## Length:199 Length:199
## Class :character Class :character
## Mode :character Mode :character
##
##
##
# Visualize data
# Histogram of the Hour values present in the sample
hist(data$Hour, main = "Distribution of Hour", xlab = "Hour")
# Box plot of rental counts grouped by hour. In a boxplot formula the plotted
# variable goes on the left of `~` and the grouping variable on the right;
# grouping Hour by Rented_Bike_Count instead would draw one box per distinct
# rental count, which is not a readable comparison.
boxplot(
  Rented_Bike_Count ~ Hour,
  data = data,
  main = "Rented bike count by hour",
  xlab = "Hour",
  ylab = "Rented bike count"
)
# Correlation analysis
# Two-sided Pearson correlation test between Hour and rental count. The
# arguments are kept as `data$...` expressions so the printed "data:" label
# stays the same.
cor.test(
  x = data$Hour,
  y = data$Rented_Bike_Count,
  alternative = "two.sided",
  method = "pearson"
)
##
## Pearson's product-moment correlation
##
## data: data$Hour and data$Rented_Bike_Count
## t = 7.7106, df = 197, p-value = 6.056e-13
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3669733 0.5816255
## sample estimates:
## cor
## 0.481487
# Linear regression
# Fits an ordinary least-squares model with Hour as the response and
# Rented_Bike_Count and Temperature as predictors.
# NOTE(review): if the aim is to explain rental demand, the roles look inverted
# (i.e. Rented_Bike_Count ~ Hour + Temperature) -- confirm the intended
# response before interpreting the coefficients.
fit <- lm(Hour ~ Rented_Bike_Count + Temperature, data = data)
summary(fit) # view model results: coefficients, residual SE, R-squared, F-test
##
## Call:
## lm(formula = Hour ~ Rented_Bike_Count + Temperature, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.986 -3.572 -0.707 3.070 15.122
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.953089 0.881600 6.753 1.61e-10 ***
## Rented_Bike_Count 0.016295 0.002262 7.205 1.22e-11 ***
## Temperature 0.364873 0.106421 3.429 0.00074 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.994 on 196 degrees of freedom
## Multiple R-squared: 0.2753, Adjusted R-squared: 0.2679
## F-statistic: 37.23 on 2 and 196 DF, p-value: 1.976e-14
# 95% confidence intervals for each coefficient of the fitted model `fit`
confint(object = fit, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 4.2144502 7.69172869
## Rented_Bike_Count 0.0118348 0.02075537
## Temperature 0.1549948 0.57475081
# ANOVA
# Analysis-of-variance fit of rental count on Temperature. Temperature is a
# numeric column, so it enters as a single 1-df term rather than as a grouping
# factor.
fit2 <- aov(formula = Rented_Bike_Count ~ Temperature, data = data)
# Print the ANOVA table (Df, sums of squares, F value, p-value)
summary(object = fit2)
## Df Sum Sq Mean Sq F value Pr(>F)
## Temperature 1 217110 217110 6.09 0.0144 *
## Residuals 197 7023320 35651
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Kmeans clustering
# Cluster the observations into 3 groups on the weather columns below. The
# columns are selected by name rather than by position (previously 5:7), so
# the clustering does not silently change if the CSV column order changes.
# NOTE(review): the features are not standardized, so Visibility (range up to
# 2000) likely dominates the distance computation -- consider scale() if that
# is not intended.
set.seed(123) # fix the random initial centers so the result is reproducible
cluster_features <- c("Humidity", "Wind_speed", "Visibility")
clusters <- kmeans(data[, cluster_features], centers = 3)
# Aggregate by cluster: mean Temperature within each cluster
aggregate(
  Temperature ~ cluster,
  data = cbind(cluster = clusters$cluster, data), # bind cluster labels to data
  FUN = mean
)
## cluster Temperature
## 1 1 -0.3707317
## 2 2 2.9255319
## 3 3 -2.1792793