This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read the bike data set from disk, then compute the mean of
# Rented.Bike.Count within each value of Seasons and show the result.
bike <- read.csv("D:/FALL 2023/STATISTICS/datasets/bike.csv")
grouped_data_1 <- summarise(
  group_by(bike, Seasons),
  Mean_Value = mean(Rented.Bike.Count)
)
print(grouped_data_1)
## # A tibble: 4 × 2
## Seasons Mean_Value
## <chr> <dbl>
## 1 Autumn 820.
## 2 Spring 730.
## 3 Summer 1034.
## 4 Winter 226.
# Column chart: one bar per season, bar height = Mean_Value.
# The x-axis tick labels are rotated 45 degrees and right-aligned.
ggplot(data = grouped_data_1, mapping = aes(x = Seasons, y = Mean_Value)) +
  geom_col() +
  ggtitle("Summarization Example") +
  xlab("Seasons") +
  ylab("Mean_value") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Standard deviation of Wind.speed within each value of Seasons.
grouped_data_2 <- summarise(
  group_by(bike, Seasons),
  sd_Value = sd(Wind.speed)
)
print(grouped_data_2)
## # A tibble: 4 × 2
## Seasons sd_Value
## <chr> <dbl>
## 1 Autumn 0.924
## 2 Spring 1.07
## 3 Summer 0.914
## 4 Winter 1.16
# Point plot: one point per season, positioned at that season's sd_Value.
ggplot(data = grouped_data_2, mapping = aes(x = Seasons, y = sd_Value)) +
  geom_point() +
  ggtitle("Scatterplot Example") +
  xlab("SEASONS") +
  ylab("STANDARD DEVIATION")
# Total of Visibility within each value of Holiday.
grouped_data_3 <- summarise(
  group_by(bike, Holiday),
  sum_Value = sum(Visibility)
)
print(grouped_data_3)
## # A tibble: 2 × 2
## Holiday sum_Value
## <chr> <int>
## 1 Holiday 657366
## 2 No Holiday 11929228
# Single-row tile plot: one tile per Holiday value (y is fixed at 1),
# coloured by sum_Value on a black-to-blue gradient.
ggplot(
  data = grouped_data_3,
  mapping = aes(x = Holiday, y = 1, fill = sum_Value)
) +
  geom_tile() +
  scale_fill_gradient(low = "black", high = "blue") +
  labs(
    title = "Heatmap of Numerical vs. Categorical",
    x = "Holiday",
    y = "sumvalues"
  ) +
  # theme_minimal() must come before theme() so the axis-text tweak survives.
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Expected probability ----
# Probability1 = each season's Mean_Value divided by the sum of all
# Mean_Value entries, so the column sums to 1.
grouped_data_1 <- mutate(
  grouped_data_1,
  Probability1 = Mean_Value / sum(Mean_Value)
)
print(grouped_data_1)
## # A tibble: 4 × 3
## Seasons Mean_Value Probability1
## <chr> <dbl> <dbl>
## 1 Autumn 820. 0.292
## 2 Spring 730. 0.260
## 3 Summer 1034. 0.368
## 4 Winter 226. 0.0803
# Probability2 = each season's sd_Value divided by the sum of all
# sd_Value entries.
grouped_data_2 <- mutate(
  grouped_data_2,
  Probability2 = sd_Value / sum(sd_Value)
)
print(grouped_data_2)
## # A tibble: 4 × 3
## Seasons sd_Value Probability2
## <chr> <dbl> <dbl>
## 1 Autumn 0.924 0.227
## 2 Spring 1.07 0.263
## 3 Summer 0.914 0.225
## 4 Winter 1.16 0.284
# Probability3 = each Holiday group's sum_Value divided by the sum of all
# sum_Value entries.
grouped_data_3 <- mutate(
  grouped_data_3,
  Probability3 = sum_Value / sum(sum_Value)
)
print(grouped_data_3)
## # A tibble: 2 × 3
## Holiday sum_Value Probability3
## <chr> <int> <dbl>
## 1 Holiday 657366 0.0522
## 2 No Holiday 11929228 0.948
# Pick the Seasons label(s) whose Probability1 equals the smallest
# Probability1 in the table; that label is treated as the anomaly below.
anomoly1 <- pull(
  filter(grouped_data_1, Probability1 == min(Probability1)),
  Seasons
)
print(anomoly1)
## [1] "Winter"
# Add an `anomoly1` column to `bike`: "Anomaly" for rows whose Seasons value
# is one of the labels stored in the `anomoly1` variable, "Normal" otherwise.
#
# `.env$anomoly1` forces the lookup to use the variable from the calling
# environment. Without it, re-running this chunk after the column exists
# would compare Seasons with the freshly created `anomoly1` column instead.
# `%in%` (rather than `==`) keeps the comparison correct when several seasons
# tie for the minimum probability and `anomoly1` holds more than one label.
bike <- bike %>%
  mutate(
    anomoly1 = ifelse(Seasons %in% .env$anomoly1, "Anomaly", "Normal")
  )
print(head(bike, 10))
## Date Rented.Bike.Count Hour Temperature Humidity Wind.speed Visibility
## 1 1/12/2017 254 0 -5.2 37 2.2 2000
## 2 1/12/2017 204 1 -5.5 38 0.8 2000
## 3 1/12/2017 173 2 -6.0 39 1.0 2000
## 4 1/12/2017 107 3 -6.2 40 0.9 2000
## 5 1/12/2017 78 4 -6.0 36 2.3 2000
## 6 1/12/2017 100 5 -6.4 37 1.5 2000
## 7 1/12/2017 181 6 -6.6 35 1.3 2000
## 8 1/12/2017 460 7 -7.4 38 0.9 2000
## 9 1/12/2017 930 8 -7.6 37 1.1 2000
## 10 1/12/2017 490 9 -6.5 27 0.5 1928
## Dew.point.temperature Solar.Radiation Rainfall Snowfall Seasons Holiday
## 1 -17.6 0.00 0 0 Winter No Holiday
## 2 -17.6 0.00 0 0 Winter No Holiday
## 3 -17.7 0.00 0 0 Winter No Holiday
## 4 -17.6 0.00 0 0 Winter No Holiday
## 5 -18.6 0.00 0 0 Winter No Holiday
## 6 -18.7 0.00 0 0 Winter No Holiday
## 7 -19.5 0.00 0 0 Winter No Holiday
## 8 -19.3 0.00 0 0 Winter No Holiday
## 9 -19.8 0.01 0 0 Winter No Holiday
## 10 -22.4 0.23 0 0 Winter No Holiday
## Functioning.Day anomoly1
## 1 Yes Anomaly
## 2 Yes Anomaly
## 3 Yes Anomaly
## 4 Yes Anomaly
## 5 Yes Anomaly
## 6 Yes Anomaly
## 7 Yes Anomaly
## 8 Yes Anomaly
## 9 Yes Anomaly
## 10 Yes Anomaly
# Pick the Holiday label(s) whose Probability3 equals the smallest
# Probability3 in the table.
anomoly2 <- pull(
  filter(grouped_data_3, Probability3 == min(Probability3)),
  Holiday
)
print(anomoly2)
## [1] "Holiday"
# Add an `anomoly2` column to `bike`: "Anomaly" for rows whose Holiday value
# is one of the labels stored in the `anomoly2` variable, "Normal" otherwise.
#
# `anomoly2` is pulled from the Holiday column of grouped_data_3, so it must
# be compared with `Holiday`. Comparing it with `Seasons` (as before) can
# never match, which silently labelled every row "Normal".
# `.env$anomoly2` pins the lookup to the variable rather than a same-named
# column on re-runs; `%in%` stays correct if several labels tie for the
# minimum.
bike <- bike %>%
  mutate(
    anomoly2 = ifelse(Holiday %in% .env$anomoly2, "Anomaly", "Normal")
  )
print(head(bike, 10))
## Date Rented.Bike.Count Hour Temperature Humidity Wind.speed Visibility
## 1 1/12/2017 254 0 -5.2 37 2.2 2000
## 2 1/12/2017 204 1 -5.5 38 0.8 2000
## 3 1/12/2017 173 2 -6.0 39 1.0 2000
## 4 1/12/2017 107 3 -6.2 40 0.9 2000
## 5 1/12/2017 78 4 -6.0 36 2.3 2000
## 6 1/12/2017 100 5 -6.4 37 1.5 2000
## 7 1/12/2017 181 6 -6.6 35 1.3 2000
## 8 1/12/2017 460 7 -7.4 38 0.9 2000
## 9 1/12/2017 930 8 -7.6 37 1.1 2000
## 10 1/12/2017 490 9 -6.5 27 0.5 1928
## Dew.point.temperature Solar.Radiation Rainfall Snowfall Seasons Holiday
## 1 -17.6 0.00 0 0 Winter No Holiday
## 2 -17.6 0.00 0 0 Winter No Holiday
## 3 -17.7 0.00 0 0 Winter No Holiday
## 4 -17.6 0.00 0 0 Winter No Holiday
## 5 -18.6 0.00 0 0 Winter No Holiday
## 6 -18.7 0.00 0 0 Winter No Holiday
## 7 -19.5 0.00 0 0 Winter No Holiday
## 8 -19.3 0.00 0 0 Winter No Holiday
## 9 -19.8 0.01 0 0 Winter No Holiday
## 10 -22.4 0.23 0 0 Winter No Holiday
## Functioning.Day anomoly1 anomoly2
## 1 Yes Anomaly Normal
## 2 Yes Anomaly Normal
## 3 Yes Anomaly Normal
## 4 Yes Anomaly Normal
## 5 Yes Anomaly Normal
## 6 Yes Anomaly Normal
## 7 Yes Anomaly Normal
## 8 Yes Anomaly Normal
## 9 Yes Anomaly Normal
## 10 Yes Anomaly Normal
# Keep the first 100 rows of `bike` in a separate object.
bike_short <- head(bike, n = 100L)

# Number of rows for every (Seasons, Humidity) combination in the full data;
# grouping is dropped from the result. Show the first 10 combinations.
combination_counts1 <- summarise(
  group_by(bike, Seasons, Humidity),
  count = n(),
  .groups = "drop"
)
print(head(combination_counts1, n = 10L))
## # A tibble: 10 × 3
## Seasons Humidity count
## <chr> <int> <int>
## 1 Autumn 13 1
## 2 Autumn 14 2
## 3 Autumn 15 1
## 4 Autumn 16 4
## 5 Autumn 17 2
## 6 Autumn 18 2
## 7 Autumn 19 1
## 8 Autumn 20 5
## 9 Autumn 21 3
## 10 Autumn 22 2
# Bar chart of `count` per season, with one facet per Humidity value.
#
# Columns are referenced by bare name inside aes() and the facet formula so
# that ggplot2 evaluates them against the plot data. Writing
# `combination_counts1$col` bypasses the plot data and raises the
# "Use of `...$...` is discouraged" warning.
#
# Note: no fill aesthetic is mapped, so the `fill` label, the fill scale and
# the legend position are kept only to leave the original plot spec intact.
ggplot(combination_counts1, aes(x = Seasons, y = count)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.8)) +
  facet_wrap(~Humidity) +
  labs(
    x = "Season",
    y = "Count",
    fill = "Humidity",
    title = "Grouped Barplot for Season and count with respect to humidity"
  ) +
  theme_minimal() +
  scale_fill_discrete() +
  theme(legend.position = "top")
## Warning: Use of `combination_counts1$Seasons` is discouraged.
## ℹ Use `Seasons` instead.
## Warning: Use of `combination_counts1$count` is discouraged.
## ℹ Use `count` instead.
# Number of rows for every (Seasons, Solar.Radiation) combination;
# show the first 10 combinations.
combination_counts2 <- summarise(
  group_by(bike, Seasons, Solar.Radiation),
  count = n(),
  .groups = "drop"
)
print(head(combination_counts2, n = 10L))
## # A tibble: 10 × 3
## Seasons Solar.Radiation count
## <chr> <dbl> <int>
## 1 Autumn 0 1143
## 2 Autumn 0.01 28
## 3 Autumn 0.02 20
## 4 Autumn 0.03 24
## 5 Autumn 0.04 8
## 6 Autumn 0.05 13
## 7 Autumn 0.06 11
## 8 Autumn 0.07 11
## 9 Autumn 0.08 6
## 10 Autumn 0.09 6
# Number of rows for every (Seasons, Rented.Bike.Count) combination;
# show the first 10 combinations.
combination_counts3 <- summarise(
  group_by(bike, Seasons, Rented.Bike.Count),
  count = n(),
  .groups = "drop"
)
print(head(combination_counts3, n = 10L))
## # A tibble: 10 × 3
## Seasons Rented.Bike.Count count
## <chr> <int> <int>
## 1 Autumn 0 247
## 2 Autumn 2 1
## 3 Autumn 4 1
## 4 Autumn 5 2
## 5 Autumn 6 1
## 6 Autumn 8 1
## 7 Autumn 12 1
## 8 Autumn 14 2
## 9 Autumn 17 3
## 10 Autumn 18 2
# Number of rows for every (Seasons, Wind.speed) combination;
# show the first 10 combinations.
combination_counts4 <- summarise(
  group_by(bike, Seasons, Wind.speed),
  count = n(),
  .groups = "drop"
)
print(head(combination_counts4, n = 10L))
## # A tibble: 10 × 3
## Seasons Wind.speed count
## <chr> <dbl> <int>
## 1 Autumn 0 35
## 2 Autumn 0.1 20
## 3 Autumn 0.2 41
## 4 Autumn 0.3 49
## 5 Autumn 0.4 52
## 6 Autumn 0.5 93
## 7 Autumn 0.6 89
## 8 Autumn 0.7 81
## 9 Autumn 0.8 90
## 10 Autumn 0.9 98
# Number of rows for every (Holiday, Humidity) combination;
# show the first 10 combinations.
combination_counts5 <- summarise(
  group_by(bike, Holiday, Humidity),
  count = n(),
  .groups = "drop"
)
print(head(combination_counts5, n = 10L))
## # A tibble: 10 × 3
## Holiday Humidity count
## <chr> <int> <int>
## 1 Holiday 0 4
## 2 Holiday 14 2
## 3 Holiday 15 3
## 4 Holiday 16 1
## 5 Holiday 17 6
## 6 Holiday 18 3
## 7 Holiday 19 2
## 8 Holiday 20 2
## 9 Holiday 21 2
## 10 Holiday 22 2
# Bar chart of `count` per Holiday value, with one facet per Humidity value.
#
# Columns are referenced by bare name inside aes() and the facet formula so
# that ggplot2 evaluates them against the plot data. Writing
# `combination_counts5$col` bypasses the plot data and raises the
# "Use of `...$...` is discouraged" warning.
#
# Note: no fill aesthetic is mapped, so the `fill` label, the fill scale and
# the legend position are kept only to leave the original plot spec intact.
ggplot(combination_counts5, aes(x = Holiday, y = count)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.8)) +
  facet_wrap(~Humidity) +
  labs(
    x = "holiday",
    y = "Count",
    fill = "Humidity",
    title = "Grouped Barplot for holiday and count with respect to humidity"
  ) +
  theme_minimal() +
  scale_fill_discrete() +
  theme(legend.position = "top")
## Warning: Use of `combination_counts5$Holiday` is discouraged.
## ℹ Use `Holiday` instead.
## Warning: Use of `combination_counts5$count` is discouraged.
## ℹ Use `count` instead.
# Number of rows for every (Holiday, Rented.Bike.Count) combination;
# show the first 10 combinations.
combination_counts6 <- summarise(
  group_by(bike, Holiday, Rented.Bike.Count),
  count = n(),
  .groups = "drop"
)
print(head(combination_counts6, n = 10L))
## # A tibble: 10 × 3
## Holiday Rented.Bike.Count count
## <chr> <int> <int>
## 1 Holiday 0 24
## 2 Holiday 3 2
## 3 Holiday 4 3
## 4 Holiday 7 1
## 5 Holiday 9 1
## 6 Holiday 11 1
## 7 Holiday 12 1
## 8 Holiday 14 1
## 9 Holiday 18 1
## 10 Holiday 20 1
# Number of rows for every (Functioning.Day, Rented.Bike.Count) combination;
# show the first 10 combinations.
combination_counts7 <- summarise(
  group_by(bike, Functioning.Day, Rented.Bike.Count),
  count = n(),
  .groups = "drop"
)
print(head(combination_counts7, n = 10L))
## # A tibble: 10 × 3
## Functioning.Day Rented.Bike.Count count
## <chr> <int> <int>
## 1 No 0 295
## 2 Yes 2 3
## 3 Yes 3 2
## 4 Yes 4 5
## 5 Yes 5 3
## 6 Yes 6 3
## 7 Yes 7 4
## 8 Yes 8 7
## 9 Yes 9 12
## 10 Yes 10 7
1. COMBINATION DID NOT WORK WITH DATE. AS PER THE DATASET, THE DATE ATTRIBUTE IS CONSIDERED A NON-CATEGORICAL VARIABLE. 2. THE VALUES OF RENTED.BIKE.COUNT CORRESPONDING TO NON-FUNCTIONING DAYS ARE ALL ZEROES. WHILE PLOTTING, THE PLOT DIMINISHES FOR NON-FUNCTIONING DAYS, WHICH IS MISLEADING. 3. IF THE NUMERICAL VARIABLE HAS AN EXTREMELY SKEWED OR NON-NORMAL DISTRIBUTION, IT CAN AFFECT THE INTERPRETATION OF THE RELATIONSHIP BETWEEN THE VARIABLES. THE SAME THING IS HAPPENING WITH THE SOLAR.RADIATION COLUMN WITH RESPECT TO SEASONS: IT IS AFFECTING THE RELATIONSHIP BETWEEN THE VARIABLES AND THE PLOTTING.
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.