R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Read the bike dataset that the code below groups, summarises and plots.
# NOTE(review): hard-coded absolute Windows path - this only runs on a machine
# that has this exact file location; consider a project-relative path.
bike <- read.csv('D:\\dataset\\bike.csv')

The dplyr package (loaded above) provides group_by() and summarize(), which are used throughout the analysis below.

PART 1

# Mean rented-bike count for each season (one row per season).
grouped_data_1 <- summarise(
  group_by(bike, Seasons),
  Mean_Value = mean(Rented.Bike.Count)
)

print(grouped_data_1)
## # A tibble: 4 × 2
##   Seasons Mean_Value
##   <chr>        <dbl>
## 1 Autumn        820.
## 2 Spring        730.
## 3 Summer       1034.
## 4 Winter        226.

ggplot2 was already loaded at the top of the document, so it does not need to be loaded again here.

Create a bar plot

# Bar chart of the mean rented-bike count per season; x labels tilted 45 degrees.
ggplot(data = grouped_data_1, mapping = aes(x = Seasons, y = Mean_Value)) +
  geom_col() +
  ggtitle("Summarization Example") +
  xlab("Seasons") +
  ylab("Mean_value") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Standard deviation of wind speed for each season (one row per season).
grouped_data_2 <- summarise(
  group_by(bike, Seasons),
  sd_Value = sd(Wind.speed)
)

print(grouped_data_2)
## # A tibble: 4 × 2
##   Seasons sd_Value
##   <chr>      <dbl>
## 1 Autumn     0.924
## 2 Spring     1.07 
## 3 Summer     0.914
## 4 Winter     1.16
# Point plot of the per-season wind-speed standard deviation.
ggplot(data = grouped_data_2, mapping = aes(x = Seasons, y = sd_Value)) +
  geom_point() +
  ggtitle("Scatterplot Example") +
  xlab("SEASONS") +
  ylab("STANDARD DEVIATION")

# Total of the Visibility column for each Holiday category.
grouped_data_3 <- summarise(
  group_by(bike, Holiday),
  sum_Value = sum(Visibility)
)

print(grouped_data_3)
## # A tibble: 2 × 2
##   Holiday    sum_Value
##   <chr>          <int>
## 1 Holiday       657366
## 2 No Holiday  11929228
# Single-row heatmap: one tile per Holiday category, shaded by sum_Value
# on a black-to-blue gradient.
ggplot(
  data = grouped_data_3,
  mapping = aes(x = Holiday, y = 1, fill = sum_Value)
) +
  geom_tile() +
  ggtitle("Heatmap of Numerical vs. Categorical") +
  xlab("Holiday") +
  ylab("sumvalues") +
  scale_fill_gradient(low = "black", high = "blue") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

### Expected probability

# Each season's share of the summed per-season means (shares add up to 1).
grouped_data_1 <- mutate(
  grouped_data_1,
  Probability1 = prop.table(Mean_Value)
)
print(grouped_data_1)
## # A tibble: 4 × 3
##   Seasons Mean_Value Probability1
##   <chr>        <dbl>        <dbl>
## 1 Autumn        820.       0.292 
## 2 Spring        730.       0.260 
## 3 Summer       1034.       0.368 
## 4 Winter        226.       0.0803
# Each season's share of the summed per-season standard deviations.
grouped_data_2 <- mutate(
  grouped_data_2,
  Probability2 = prop.table(sd_Value)
)
print(grouped_data_2)
## # A tibble: 4 × 3
##   Seasons sd_Value Probability2
##   <chr>      <dbl>        <dbl>
## 1 Autumn     0.924        0.227
## 2 Spring     1.07         0.263
## 3 Summer     0.914        0.225
## 4 Winter     1.16         0.284

Expected probability

# Each Holiday category's share of the overall sum_Value total.
grouped_data_3 <- mutate(
  grouped_data_3,
  Probability3 = prop.table(sum_Value)
)
print(grouped_data_3)
## # A tibble: 2 × 3
##   Holiday    sum_Value Probability3
##   <chr>          <int>        <dbl>
## 1 Holiday       657366       0.0522
## 2 No Holiday  11929228       0.948

Anomaly

# Season(s) whose Probability1 equals the minimum Probability1.
anomoly1 <- pull(
  filter(grouped_data_1, Probability1 == min(Probability1)),
  Seasons
)
print(anomoly1)
## [1] "Winter"
# Tag every row: "Anomaly" when its season matches anomoly1, otherwise "Normal".
bike <- mutate(
  bike,
  anomoly1 = ifelse(Seasons == anomoly1, "Anomaly", "Normal")
)
print(head(bike, 10))
##         Date Rented.Bike.Count Hour Temperature Humidity Wind.speed Visibility
## 1  1/12/2017               254    0        -5.2       37        2.2       2000
## 2  1/12/2017               204    1        -5.5       38        0.8       2000
## 3  1/12/2017               173    2        -6.0       39        1.0       2000
## 4  1/12/2017               107    3        -6.2       40        0.9       2000
## 5  1/12/2017                78    4        -6.0       36        2.3       2000
## 6  1/12/2017               100    5        -6.4       37        1.5       2000
## 7  1/12/2017               181    6        -6.6       35        1.3       2000
## 8  1/12/2017               460    7        -7.4       38        0.9       2000
## 9  1/12/2017               930    8        -7.6       37        1.1       2000
## 10 1/12/2017               490    9        -6.5       27        0.5       1928
##    Dew.point.temperature Solar.Radiation Rainfall Snowfall Seasons    Holiday
## 1                  -17.6            0.00        0        0  Winter No Holiday
## 2                  -17.6            0.00        0        0  Winter No Holiday
## 3                  -17.7            0.00        0        0  Winter No Holiday
## 4                  -17.6            0.00        0        0  Winter No Holiday
## 5                  -18.6            0.00        0        0  Winter No Holiday
## 6                  -18.7            0.00        0        0  Winter No Holiday
## 7                  -19.5            0.00        0        0  Winter No Holiday
## 8                  -19.3            0.00        0        0  Winter No Holiday
## 9                  -19.8            0.01        0        0  Winter No Holiday
## 10                 -22.4            0.23        0        0  Winter No Holiday
##    Functioning.Day anomoly1
## 1              Yes  Anomaly
## 2              Yes  Anomaly
## 3              Yes  Anomaly
## 4              Yes  Anomaly
## 5              Yes  Anomaly
## 6              Yes  Anomaly
## 7              Yes  Anomaly
## 8              Yes  Anomaly
## 9              Yes  Anomaly
## 10             Yes  Anomaly
# Holiday category (or categories) whose Probability3 equals the minimum.
anomoly2 <- pull(
  filter(grouped_data_3, Probability3 == min(Probability3)),
  Holiday
)
print(anomoly2)
## [1] "Holiday"
# Tag every row: "Anomaly" when its Holiday value matches anomoly2, otherwise
# "Normal".
# anomoly2 is pulled from the Holiday column of grouped_data_3, so it must be
# compared against Holiday. Comparing it against Seasons (as before) could
# never match, which silently marked every row "Normal".
bike <- bike %>%
  mutate(anomoly2 = ifelse(Holiday == anomoly2, "Anomaly", "Normal"))
print(head(bike, 10))
##         Date Rented.Bike.Count Hour Temperature Humidity Wind.speed Visibility
## 1  1/12/2017               254    0        -5.2       37        2.2       2000
## 2  1/12/2017               204    1        -5.5       38        0.8       2000
## 3  1/12/2017               173    2        -6.0       39        1.0       2000
## 4  1/12/2017               107    3        -6.2       40        0.9       2000
## 5  1/12/2017                78    4        -6.0       36        2.3       2000
## 6  1/12/2017               100    5        -6.4       37        1.5       2000
## 7  1/12/2017               181    6        -6.6       35        1.3       2000
## 8  1/12/2017               460    7        -7.4       38        0.9       2000
## 9  1/12/2017               930    8        -7.6       37        1.1       2000
## 10 1/12/2017               490    9        -6.5       27        0.5       1928
##    Dew.point.temperature Solar.Radiation Rainfall Snowfall Seasons    Holiday
## 1                  -17.6            0.00        0        0  Winter No Holiday
## 2                  -17.6            0.00        0        0  Winter No Holiday
## 3                  -17.7            0.00        0        0  Winter No Holiday
## 4                  -17.6            0.00        0        0  Winter No Holiday
## 5                  -18.6            0.00        0        0  Winter No Holiday
## 6                  -18.7            0.00        0        0  Winter No Holiday
## 7                  -19.5            0.00        0        0  Winter No Holiday
## 8                  -19.3            0.00        0        0  Winter No Holiday
## 9                  -19.8            0.01        0        0  Winter No Holiday
## 10                 -22.4            0.23        0        0  Winter No Holiday
##    Functioning.Day anomoly1 anomoly2
## 1              Yes  Anomaly   Normal
## 2              Yes  Anomaly   Normal
## 3              Yes  Anomaly   Normal
## 4              Yes  Anomaly   Normal
## 5              Yes  Anomaly   Normal
## 6              Yes  Anomaly   Normal
## 7              Yes  Anomaly   Normal
## 8              Yes  Anomaly   Normal
## 9              Yes  Anomaly   Normal
## 10             Yes  Anomaly   Normal

PART II

COMBINATIONS

Calculating the frequency of each combination

# First 100 rows of bike.
# NOTE(review): bike_short is not referenced by any of the visible code - the
# combination counts below all use the full `bike` data; confirm whether this
# subset is still needed.
bike_short <- head(bike, 100)


# Number of rows for every (Seasons, Humidity) pair; the first 10 are printed.
combination_counts1 <- summarise(
  group_by(bike, Seasons, Humidity),
  count = n(),
  .groups = "drop"
)
print(head(combination_counts1, 10))
## # A tibble: 10 × 3
##    Seasons Humidity count
##    <chr>      <int> <int>
##  1 Autumn        13     1
##  2 Autumn        14     2
##  3 Autumn        15     1
##  4 Autumn        16     4
##  5 Autumn        17     2
##  6 Autumn        18     2
##  7 Autumn        19     1
##  8 Autumn        20     5
##  9 Autumn        21     3
## 10 Autumn        22     2
# Bar chart of row counts per season, one facet per Humidity value.
# Columns are referenced by bare name inside aes() and facet_wrap(): writing
# `combination_counts1$column` there triggers ggplot2's
# "Use of `...$...` is discouraged" warnings.
# No fill aesthetic is mapped, so the former fill label and fill scale had no
# effect and are omitted.
ggplot(combination_counts1, aes(x = Seasons, y = count)) +
  geom_col(position = position_dodge(width = 0.8)) +
  facet_wrap(~Humidity) +
  labs(
    x = "Season",
    y = "Count",
    title = "Grouped Barplot for Season and count with respect to humidity"
  ) +
  theme_minimal() +
  theme(legend.position = "top")

# Number of rows for every (Seasons, Solar.Radiation) pair; first 10 printed.
combination_counts2 <- summarise(
  group_by(bike, Seasons, Solar.Radiation),
  count = n(),
  .groups = "drop"
)
print(head(combination_counts2, 10))
## # A tibble: 10 × 3
##    Seasons Solar.Radiation count
##    <chr>             <dbl> <int>
##  1 Autumn             0     1143
##  2 Autumn             0.01    28
##  3 Autumn             0.02    20
##  4 Autumn             0.03    24
##  5 Autumn             0.04     8
##  6 Autumn             0.05    13
##  7 Autumn             0.06    11
##  8 Autumn             0.07    11
##  9 Autumn             0.08     6
## 10 Autumn             0.09     6
# Number of rows for every (Seasons, Rented.Bike.Count) pair; first 10 printed.
combination_counts3 <- summarise(
  group_by(bike, Seasons, Rented.Bike.Count),
  count = n(),
  .groups = "drop"
)
print(head(combination_counts3, 10))
## # A tibble: 10 × 3
##    Seasons Rented.Bike.Count count
##    <chr>               <int> <int>
##  1 Autumn                  0   247
##  2 Autumn                  2     1
##  3 Autumn                  4     1
##  4 Autumn                  5     2
##  5 Autumn                  6     1
##  6 Autumn                  8     1
##  7 Autumn                 12     1
##  8 Autumn                 14     2
##  9 Autumn                 17     3
## 10 Autumn                 18     2
# Number of rows for every (Seasons, Wind.speed) pair; first 10 printed.
combination_counts4 <- summarise(
  group_by(bike, Seasons, Wind.speed),
  count = n(),
  .groups = "drop"
)
print(head(combination_counts4, 10))
## # A tibble: 10 × 3
##    Seasons Wind.speed count
##    <chr>        <dbl> <int>
##  1 Autumn         0      35
##  2 Autumn         0.1    20
##  3 Autumn         0.2    41
##  4 Autumn         0.3    49
##  5 Autumn         0.4    52
##  6 Autumn         0.5    93
##  7 Autumn         0.6    89
##  8 Autumn         0.7    81
##  9 Autumn         0.8    90
## 10 Autumn         0.9    98
# Number of rows for every (Holiday, Humidity) pair; first 10 printed.
combination_counts5 <- summarise(
  group_by(bike, Holiday, Humidity),
  count = n(),
  .groups = "drop"
)
print(head(combination_counts5, 10))
## # A tibble: 10 × 3
##    Holiday Humidity count
##    <chr>      <int> <int>
##  1 Holiday        0     4
##  2 Holiday       14     2
##  3 Holiday       15     3
##  4 Holiday       16     1
##  5 Holiday       17     6
##  6 Holiday       18     3
##  7 Holiday       19     2
##  8 Holiday       20     2
##  9 Holiday       21     2
## 10 Holiday       22     2
# Bar chart of row counts per Holiday category, one facet per Humidity value.
# Columns are referenced by bare name inside aes() and facet_wrap(): writing
# `combination_counts5$column` there triggers ggplot2's
# "Use of `...$...` is discouraged" warnings.
# No fill aesthetic is mapped, so the former fill label and fill scale had no
# effect and are omitted.
ggplot(combination_counts5, aes(x = Holiday, y = count)) +
  geom_col(position = position_dodge(width = 0.8)) +
  facet_wrap(~Humidity) +
  labs(
    x = "holiday",
    y = "Count",
    title = "Grouped Barplot for holiday and count with respect to humidity"
  ) +
  theme_minimal() +
  theme(legend.position = "top")

# Number of rows for every (Holiday, Rented.Bike.Count) pair; first 10 printed.
combination_counts6 <- summarise(
  group_by(bike, Holiday, Rented.Bike.Count),
  count = n(),
  .groups = "drop"
)
print(head(combination_counts6, 10))
## # A tibble: 10 × 3
##    Holiday Rented.Bike.Count count
##    <chr>               <int> <int>
##  1 Holiday                 0    24
##  2 Holiday                 3     2
##  3 Holiday                 4     3
##  4 Holiday                 7     1
##  5 Holiday                 9     1
##  6 Holiday                11     1
##  7 Holiday                12     1
##  8 Holiday                14     1
##  9 Holiday                18     1
## 10 Holiday                20     1
# Number of rows for every (Functioning.Day, Rented.Bike.Count) pair;
# first 10 printed.
combination_counts7 <- summarise(
  group_by(bike, Functioning.Day, Rented.Bike.Count),
  count = n(),
  .groups = "drop"
)
print(head(combination_counts7, 10))
## # A tibble: 10 × 3
##    Functioning.Day Rented.Bike.Count count
##    <chr>                       <int> <int>
##  1 No                              0   295
##  2 Yes                             2     3
##  3 Yes                             3     2
##  4 Yes                             4     5
##  5 Yes                             5     3
##  6 Yes                             6     3
##  7 Yes                             7     4
##  8 Yes                             8     7
##  9 Yes                             9    12
## 10 Yes                            10     7

Conclusion on combination

1. The combination approach failed to produce meaningful results when applied to the Date variable. Note that, in this dataset, Date is not treated as a categorical attribute.
2. All rented bike counts on non-functioning days are zero. As a result, plots of this data shrink toward zero on those days, which can lead to misleading interpretations.
3. A numerical variable with a strongly skewed or non-normal distribution can significantly affect how relationships between variables are interpreted. This issue arises with the Solar Radiation column across the different seasons, limiting how accurately plots can represent and convey the relationships between variables.

Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.