This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the bike data set and compute the mean rented-bike count per season.
bike <- read.csv('D:\\dataset\\bike.csv')
grouped_data_1 <- summarize(
  group_by(bike, Seasons),
  Mean_Value = mean(Rented.Bike.Count)
)
print(grouped_data_1)
## # A tibble: 4 × 2
## Seasons Mean_Value
## <chr> <dbl>
## 1 Autumn 820.
## 2 Spring 730.
## 3 Summer 1034.
## 4 Winter 226.
# Bar chart of the per-season mean; x labels are tilted 45 degrees.
ggplot(grouped_data_1, aes(x = Seasons, y = Mean_Value)) +
  geom_col() +
  ggtitle("Summarization Example") +
  xlab("Seasons") +
  ylab("Mean_value") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Standard deviation of wind speed within each season.
grouped_data_2 <- summarize(
  group_by(bike, Seasons),
  sd_Value = sd(Wind.speed)
)
print(grouped_data_2)
## # A tibble: 4 × 2
## Seasons sd_Value
## <chr> <dbl>
## 1 Autumn 0.924
## 2 Spring 1.07
## 3 Summer 0.914
## 4 Winter 1.16
# One point per season showing the wind-speed standard deviation.
ggplot(grouped_data_2, aes(x = Seasons, y = sd_Value)) +
  geom_point() +
  ggtitle("Scatterplot Example") +
  xlab("SEASONS") +
  ylab("STANDARD DEVIATION")
# Total visibility for each level of the Holiday column.
grouped_data_3 <- summarize(
  group_by(bike, Holiday),
  sum_Value = sum(Visibility)
)
print(grouped_data_3)
## # A tibble: 2 × 2
## Holiday sum_Value
## <chr> <int>
## 1 Holiday 657366
## 2 No Holiday 11929228
# Single-row tile chart: one tile per Holiday level, coloured by sum_Value.
# theme_minimal() has to come before theme() so the axis-text tweak survives.
ggplot(grouped_data_3, aes(x = Holiday, y = 1, fill = sum_Value)) +
  geom_tile() +
  scale_fill_gradient(low = "black", high = "blue") +
  ggtitle("Heatmap of Numerical vs. Categorical") +
  xlab("Holiday") +
  ylab("sumvalues") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
### Expected probability
# Add each season's share of the summed Mean_Value column.
grouped_data_1 <- mutate(
  grouped_data_1,
  Probability1 = Mean_Value / sum(Mean_Value)
)
print(grouped_data_1)
## # A tibble: 4 × 3
## Seasons Mean_Value Probability1
## <chr> <dbl> <dbl>
## 1 Autumn 820. 0.292
## 2 Spring 730. 0.260
## 3 Summer 1034. 0.368
## 4 Winter 226. 0.0803
# Add each season's share of the summed sd_Value column.
grouped_data_2 <- mutate(
  grouped_data_2,
  Probability2 = sd_Value / sum(sd_Value)
)
print(grouped_data_2)
## # A tibble: 4 × 3
## Seasons sd_Value Probability2
## <chr> <dbl> <dbl>
## 1 Autumn 0.924 0.227
## 2 Spring 1.07 0.263
## 3 Summer 0.914 0.225
## 4 Winter 1.16 0.284
# Add each Holiday level's share of the summed sum_Value column.
grouped_data_3 <- mutate(
  grouped_data_3,
  Probability3 = sum_Value / sum(sum_Value)
)
print(grouped_data_3)
## # A tibble: 2 × 3
## Holiday sum_Value Probability3
## <chr> <int> <dbl>
## 1 Holiday 657366 0.0522
## 2 No Holiday 11929228 0.948
# Season(s) whose Probability1 equals the smallest Probability1 in the table.
anomoly1 <- pull(
  filter(grouped_data_1, Probability1 == min(Probability1)),
  Seasons
)
print(anomoly1)
## [1] "Winter"
# Flag every row whose season is the lowest-probability season stored in the
# external vector `anomoly1`.
# `.env$anomoly1` forces the lookup of that external vector: the new column has
# the same name, so on a re-run a bare `anomoly1` would resolve to the column
# already present in `bike` instead. `%in%` also stays correct if several
# seasons tie for the minimum (`==` would silently recycle a longer vector).
bike <- bike %>%
  mutate(anomoly1 = ifelse(Seasons %in% .env$anomoly1, "Anomaly", "Normal"))
print(head(bike, 10))
## Date Rented.Bike.Count Hour Temperature Humidity Wind.speed Visibility
## 1 1/12/2017 254 0 -5.2 37 2.2 2000
## 2 1/12/2017 204 1 -5.5 38 0.8 2000
## 3 1/12/2017 173 2 -6.0 39 1.0 2000
## 4 1/12/2017 107 3 -6.2 40 0.9 2000
## 5 1/12/2017 78 4 -6.0 36 2.3 2000
## 6 1/12/2017 100 5 -6.4 37 1.5 2000
## 7 1/12/2017 181 6 -6.6 35 1.3 2000
## 8 1/12/2017 460 7 -7.4 38 0.9 2000
## 9 1/12/2017 930 8 -7.6 37 1.1 2000
## 10 1/12/2017 490 9 -6.5 27 0.5 1928
## Dew.point.temperature Solar.Radiation Rainfall Snowfall Seasons Holiday
## 1 -17.6 0.00 0 0 Winter No Holiday
## 2 -17.6 0.00 0 0 Winter No Holiday
## 3 -17.7 0.00 0 0 Winter No Holiday
## 4 -17.6 0.00 0 0 Winter No Holiday
## 5 -18.6 0.00 0 0 Winter No Holiday
## 6 -18.7 0.00 0 0 Winter No Holiday
## 7 -19.5 0.00 0 0 Winter No Holiday
## 8 -19.3 0.00 0 0 Winter No Holiday
## 9 -19.8 0.01 0 0 Winter No Holiday
## 10 -22.4 0.23 0 0 Winter No Holiday
## Functioning.Day anomoly1
## 1 Yes Anomaly
## 2 Yes Anomaly
## 3 Yes Anomaly
## 4 Yes Anomaly
## 5 Yes Anomaly
## 6 Yes Anomaly
## 7 Yes Anomaly
## 8 Yes Anomaly
## 9 Yes Anomaly
## 10 Yes Anomaly
# Holiday level(s) whose Probability3 equals the smallest Probability3.
anomoly2 <- pull(
  filter(grouped_data_3, Probability3 == min(Probability3)),
  Holiday
)
print(anomoly2)
## [1] "Holiday"
# Flag every row whose Holiday value is the lowest-probability Holiday level
# stored in the external vector `anomoly2`.
# The comparison must use the Holiday column: `anomoly2` holds a Holiday level,
# so comparing it with Seasons never matches and labels every row "Normal".
# `.env$anomoly2` forces the lookup of the external vector (the new column
# shares its name), and `%in%` stays correct if several levels tie.
bike <- bike %>%
  mutate(anomoly2 = ifelse(Holiday %in% .env$anomoly2, "Anomaly", "Normal"))
print(head(bike, 10))
## Date Rented.Bike.Count Hour Temperature Humidity Wind.speed Visibility
## 1 1/12/2017 254 0 -5.2 37 2.2 2000
## 2 1/12/2017 204 1 -5.5 38 0.8 2000
## 3 1/12/2017 173 2 -6.0 39 1.0 2000
## 4 1/12/2017 107 3 -6.2 40 0.9 2000
## 5 1/12/2017 78 4 -6.0 36 2.3 2000
## 6 1/12/2017 100 5 -6.4 37 1.5 2000
## 7 1/12/2017 181 6 -6.6 35 1.3 2000
## 8 1/12/2017 460 7 -7.4 38 0.9 2000
## 9 1/12/2017 930 8 -7.6 37 1.1 2000
## 10 1/12/2017 490 9 -6.5 27 0.5 1928
## Dew.point.temperature Solar.Radiation Rainfall Snowfall Seasons Holiday
## 1 -17.6 0.00 0 0 Winter No Holiday
## 2 -17.6 0.00 0 0 Winter No Holiday
## 3 -17.7 0.00 0 0 Winter No Holiday
## 4 -17.6 0.00 0 0 Winter No Holiday
## 5 -18.6 0.00 0 0 Winter No Holiday
## 6 -18.7 0.00 0 0 Winter No Holiday
## 7 -19.5 0.00 0 0 Winter No Holiday
## 8 -19.3 0.00 0 0 Winter No Holiday
## 9 -19.8 0.01 0 0 Winter No Holiday
## 10 -22.4 0.23 0 0 Winter No Holiday
## Functioning.Day anomoly1 anomoly2
## 1 Yes Anomaly Normal
## 2 Yes Anomaly Normal
## 3 Yes Anomaly Normal
## 4 Yes Anomaly Normal
## 5 Yes Anomaly Normal
## 6 Yes Anomaly Normal
## 7 Yes Anomaly Normal
## 8 Yes Anomaly Normal
## 9 Yes Anomaly Normal
## 10 Yes Anomaly Normal
# Keep the first 100 rows of the data in a separate object.
bike_short <- head(bike, n = 100)
# Number of rows for every Seasons x Humidity combination.
combination_counts1 <- summarise(
  group_by(bike, Seasons, Humidity),
  count = n(),
  .groups = 'drop'
)
print(head(combination_counts1, n = 10))
## # A tibble: 10 × 3
## Seasons Humidity count
## <chr> <int> <int>
## 1 Autumn 13 1
## 2 Autumn 14 2
## 3 Autumn 15 1
## 4 Autumn 16 4
## 5 Autumn 17 2
## 6 Autumn 18 2
## 7 Autumn 19 1
## 8 Autumn 20 5
## 9 Autumn 21 3
## 10 Autumn 22 2
# Bar chart of row counts per season, one facet per humidity value.
# Columns are referenced by bare name inside aes() and facet_wrap(): ggplot2
# looks them up in the data passed to ggplot(), and writing `df$col` there
# raises the "Use of `df$col` is discouraged" warning.
# NOTE(review): no fill aesthetic is mapped, so the fill label, the fill scale
# and the legend position appear to have nothing to act on -- confirm whether
# `fill = Humidity` was intended.
ggplot(combination_counts1, aes(x = Seasons, y = count)) +
  geom_col(position = position_dodge(width = 0.8)) +
  facet_wrap(~Humidity) +
  labs(
    x = "Season",
    y = "Count",
    fill = "Humidity",
    title = "Grouped Barplot for Season and count with respect to humidity"
  ) +
  theme_minimal() +
  scale_fill_discrete() +
  theme(legend.position = "top")
## Warning: Use of `combination_counts1$Seasons` is discouraged.
## ℹ Use `Seasons` instead.
## Warning: Use of `combination_counts1$count` is discouraged.
## ℹ Use `count` instead.
# Number of rows for every Seasons x Solar.Radiation combination.
combination_counts2 <- summarise(
  group_by(bike, Seasons, Solar.Radiation),
  count = n(),
  .groups = 'drop'
)
print(head(combination_counts2, n = 10))
## # A tibble: 10 × 3
## Seasons Solar.Radiation count
## <chr> <dbl> <int>
## 1 Autumn 0 1143
## 2 Autumn 0.01 28
## 3 Autumn 0.02 20
## 4 Autumn 0.03 24
## 5 Autumn 0.04 8
## 6 Autumn 0.05 13
## 7 Autumn 0.06 11
## 8 Autumn 0.07 11
## 9 Autumn 0.08 6
## 10 Autumn 0.09 6
# Number of rows for every Seasons x Rented.Bike.Count combination.
combination_counts3 <- summarise(
  group_by(bike, Seasons, Rented.Bike.Count),
  count = n(),
  .groups = 'drop'
)
print(head(combination_counts3, n = 10))
## # A tibble: 10 × 3
## Seasons Rented.Bike.Count count
## <chr> <int> <int>
## 1 Autumn 0 247
## 2 Autumn 2 1
## 3 Autumn 4 1
## 4 Autumn 5 2
## 5 Autumn 6 1
## 6 Autumn 8 1
## 7 Autumn 12 1
## 8 Autumn 14 2
## 9 Autumn 17 3
## 10 Autumn 18 2
# Number of rows for every Seasons x Wind.speed combination.
combination_counts4 <- summarise(
  group_by(bike, Seasons, Wind.speed),
  count = n(),
  .groups = 'drop'
)
print(head(combination_counts4, n = 10))
## # A tibble: 10 × 3
## Seasons Wind.speed count
## <chr> <dbl> <int>
## 1 Autumn 0 35
## 2 Autumn 0.1 20
## 3 Autumn 0.2 41
## 4 Autumn 0.3 49
## 5 Autumn 0.4 52
## 6 Autumn 0.5 93
## 7 Autumn 0.6 89
## 8 Autumn 0.7 81
## 9 Autumn 0.8 90
## 10 Autumn 0.9 98
# Number of rows for every Holiday x Humidity combination.
combination_counts5 <- summarise(
  group_by(bike, Holiday, Humidity),
  count = n(),
  .groups = 'drop'
)
print(head(combination_counts5, n = 10))
## # A tibble: 10 × 3
## Holiday Humidity count
## <chr> <int> <int>
## 1 Holiday 0 4
## 2 Holiday 14 2
## 3 Holiday 15 3
## 4 Holiday 16 1
## 5 Holiday 17 6
## 6 Holiday 18 3
## 7 Holiday 19 2
## 8 Holiday 20 2
## 9 Holiday 21 2
## 10 Holiday 22 2
# Bar chart of row counts per Holiday level, one facet per humidity value.
# Columns are referenced by bare name inside aes() and facet_wrap(): ggplot2
# looks them up in the data passed to ggplot(), and writing `df$col` there
# raises the "Use of `df$col` is discouraged" warning.
# NOTE(review): no fill aesthetic is mapped, so the fill label, the fill scale
# and the legend position appear to have nothing to act on -- confirm whether
# `fill = Humidity` was intended.
ggplot(combination_counts5, aes(x = Holiday, y = count)) +
  geom_col(position = position_dodge(width = 0.8)) +
  facet_wrap(~Humidity) +
  labs(
    x = "holiday",
    y = "Count",
    fill = "Humidity",
    title = "Grouped Barplot for holiday and count with respect to humidity"
  ) +
  theme_minimal() +
  scale_fill_discrete() +
  theme(legend.position = "top")
## Warning: Use of `combination_counts5$Holiday` is discouraged.
## ℹ Use `Holiday` instead.
## Warning: Use of `combination_counts5$count` is discouraged.
## ℹ Use `count` instead.
# Number of rows for every Holiday x Rented.Bike.Count combination.
combination_counts6 <- summarise(
  group_by(bike, Holiday, Rented.Bike.Count),
  count = n(),
  .groups = 'drop'
)
print(head(combination_counts6, n = 10))
## # A tibble: 10 × 3
## Holiday Rented.Bike.Count count
## <chr> <int> <int>
## 1 Holiday 0 24
## 2 Holiday 3 2
## 3 Holiday 4 3
## 4 Holiday 7 1
## 5 Holiday 9 1
## 6 Holiday 11 1
## 7 Holiday 12 1
## 8 Holiday 14 1
## 9 Holiday 18 1
## 10 Holiday 20 1
# Number of rows for every Functioning.Day x Rented.Bike.Count combination.
combination_counts7 <- summarise(
  group_by(bike, Functioning.Day, Rented.Bike.Count),
  count = n(),
  .groups = 'drop'
)
print(head(combination_counts7, n = 10))
## # A tibble: 10 × 3
## Functioning.Day Rented.Bike.Count count
## <chr> <int> <int>
## 1 No 0 295
## 2 Yes 2 3
## 3 Yes 3 2
## 4 Yes 4 5
## 5 Yes 5 3
## 6 Yes 6 3
## 7 Yes 7 4
## 8 Yes 8 7
## 9 Yes 9 12
## 10 Yes 10 7
1. The combination approach failed to produce meaningful results when applied to the Date variable. It is important to note that, according to the dataset, this attribute is considered non-categorical.
2. It is crucial to emphasize that all rented bike counts on non-functioning days are consistently zero. As a result, when this data is visualized, the plot collapses to zero on these non-functioning days, potentially leading to misleading interpretations.
3. When a numerical variable is heavily skewed or non-normally distributed, it can significantly affect the way we interpret relationships between variables. This issue arises for the Solar Radiation column across the different seasons, affecting our ability to accurately represent and understand the relationships between variables through plots.
Note that the `echo = FALSE` parameter was added to the code chunk
to prevent printing of the R code that generated the plot.