# Importing the packages and reading the data set
library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ purrr     1.0.2
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
# Read the comma-delimited weather file into a tibble.
weather_csv_path <- "C:/Users/user/Documents/Statistics/Telangana_2018_complete_weather_data.csv"
my_data <- read_delim(file = weather_csv_path, delim = ",")
## Rows: 311157 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): District, Mandal, Location,  Date
## dbl (6): row_id, temp_min, temp_max, humidity_min, humidity_max, wind_speed
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
  # Keep the location keys plus the two temperature columns, one row per
  # observation. `reframe()` replaces the multi-row `summarise()` call that
  # dplyr 1.1.0 deprecated. Because `reframe()` always returns an ungrouped
  # data frame, the grouping is re-applied afterwards so the result is still a
  # tibble grouped by District / Mandal / Location, exactly as before.
  grouped_data_1 <- my_data %>%
    group_by(District, Mandal, Location) %>%
    reframe(temp_min, temp_max) %>%
    group_by(District, Mandal, Location)
print(grouped_data_1)
## # A tibble: 311,157 × 5
## # Groups:   District, Mandal, Location [1,678]
##    District Mandal           Location temp_min temp_max
##    <chr>    <chr>            <chr>       <dbl>    <dbl>
##  1 Adilabad Adilabad (Urban) Adilabad     22.3     34.3
##  2 Adilabad Adilabad (Urban) Adilabad     22.4     35.1
##  3 Adilabad Adilabad (Urban) Adilabad     21.8     35.4
##  4 Adilabad Adilabad (Urban) Adilabad     21.5     36.1
##  5 Adilabad Adilabad (Urban) Adilabad     22.3     35  
##  6 Adilabad Adilabad (Urban) Adilabad     21.4     34.9
##  7 Adilabad Adilabad (Urban) Adilabad     21.6     34.9
##  8 Adilabad Adilabad (Urban) Adilabad     19.6     35.2
##  9 Adilabad Adilabad (Urban) Adilabad     19.3     35.4
## 10 Adilabad Adilabad (Urban) Adilabad     19.3     34.8
## # ℹ 311,147 more rows
#%>%
  #summarise(mean_temp_min=mean(temp_min),mean_temp_max=mean(temp_max))
#print(grouped_data_1)
# Keep the location keys plus the two humidity columns, one row per
# observation. `reframe()` replaces the multi-row `summarise()` call that
# dplyr 1.1.0 deprecated. Because `reframe()` always returns an ungrouped
# data frame, the grouping is re-applied afterwards so the result is still a
# tibble grouped by District / Mandal / Location, exactly as before.
grouped_data_2 <- my_data %>%
  group_by(District, Mandal, Location) %>%
  reframe(humidity_min, humidity_max) %>%
  group_by(District, Mandal, Location)
print(grouped_data_2)
## # A tibble: 311,157 × 5
## # Groups:   District, Mandal, Location [1,678]
##    District Mandal           Location humidity_min humidity_max
##    <chr>    <chr>            <chr>           <dbl>        <dbl>
##  1 Adilabad Adilabad (Urban) Adilabad         61.1         92.4
##  2 Adilabad Adilabad (Urban) Adilabad         56.2         92.1
##  3 Adilabad Adilabad (Urban) Adilabad         55.7         92.1
##  4 Adilabad Adilabad (Urban) Adilabad         57.5         90.9
##  5 Adilabad Adilabad (Urban) Adilabad         53.9         91.7
##  6 Adilabad Adilabad (Urban) Adilabad         56.4         91.7
##  7 Adilabad Adilabad (Urban) Adilabad         58           91.6
##  8 Adilabad Adilabad (Urban) Adilabad         53.3         91  
##  9 Adilabad Adilabad (Urban) Adilabad         49.5         90.5
## 10 Adilabad Adilabad (Urban) Adilabad         51.5         90.3
## # ℹ 311,147 more rows
# Keep the location keys plus the wind speed column, one row per observation.
# `reframe()` replaces the multi-row `summarise()` call that dplyr 1.1.0
# deprecated. Because `reframe()` always returns an ungrouped data frame, the
# grouping is re-applied afterwards so the result is still a tibble grouped by
# District / Mandal / Location, exactly as before.
grouped_data_3 <- my_data %>%
  group_by(District, Mandal, Location) %>%
  reframe(wind_speed) %>%
  group_by(District, Mandal, Location)
print(grouped_data_3)
## # A tibble: 311,157 × 4
## # Groups:   District, Mandal, Location [1,678]
##    District Mandal           Location wind_speed
##    <chr>    <chr>            <chr>         <dbl>
##  1 Adilabad Adilabad (Urban) Adilabad        5.5
##  2 Adilabad Adilabad (Urban) Adilabad        6.7
##  3 Adilabad Adilabad (Urban) Adilabad        6.6
##  4 Adilabad Adilabad (Urban) Adilabad        5.7
##  5 Adilabad Adilabad (Urban) Adilabad        7.9
##  6 Adilabad Adilabad (Urban) Adilabad        6.5
##  7 Adilabad Adilabad (Urban) Adilabad        5.4
##  8 Adilabad Adilabad (Urban) Adilabad        9  
##  9 Adilabad Adilabad (Urban) Adilabad        9.6
## 10 Adilabad Adilabad (Urban) Adilabad       13  
## # ℹ 311,147 more rows
# Empirical probability that temp_max exceeds its own overall mean.
# The mean, the count and the denominator are all taken from the same column,
# and missing values are excluded from each, so a single NA no longer turns
# the whole result into NA. With no NAs present the result is unchanged.
temp_max_values <- grouped_data_1$temp_max
avg_temp_max <- mean(temp_max_values, na.rm = TRUE)
count_greater_avg_temp <- sum(temp_max_values > avg_temp_max, na.rm = TRUE)
total_count <- sum(!is.na(temp_max_values))
prob_temp_max_great_than_avg <- count_greater_avg_temp / total_count
cat("Probability that temp_max is greater than avg_temp_max:", prob_temp_max_great_than_avg, "\n")
## Probability that temp_max is greater than avg_temp_max: 0.4912054
### The probability that temperature will be greater than average temperature has been calculated
# Empirical probability that humidity_max exceeds its own overall mean.
# The mean, the count and the denominator are all taken from the same column,
# and missing values are excluded from each, so a single NA no longer turns
# the whole result into NA. With no NAs present the result is unchanged.
humidity_max_values <- grouped_data_2$humidity_max
avg_humid_max <- mean(humidity_max_values, na.rm = TRUE)
count_greater_avg_humid <- sum(humidity_max_values > avg_humid_max, na.rm = TRUE)
total_count <- sum(!is.na(humidity_max_values))
prob_humid_max_great_than_avg <- count_greater_avg_humid / total_count
cat("Probability that humid_max is greater than avg_humid_max:", prob_humid_max_great_than_avg, "\n")
## Probability that humid_max is greater than avg_humid_max: 0.5622274
### The probability that humidity will be greater than average humidity has been calculated
# Empirical probability that wind_speed exceeds its own overall mean.
# The mean, the count and the denominator are all taken from the same column,
# and missing values are excluded from each, so a single NA no longer turns
# the whole result into NA. With no NAs present the result is unchanged.
wind_speed_values <- grouped_data_3$wind_speed
avg_wind_speed <- mean(wind_speed_values, na.rm = TRUE)
count_greater_avg_wind_speed <- sum(wind_speed_values > avg_wind_speed, na.rm = TRUE)
total_count <- sum(!is.na(wind_speed_values))
prob_wind_speed_great_than_avg <- count_greater_avg_wind_speed / total_count
cat("Probability that wind_speed is greater than avg_wind_speed:", prob_wind_speed_great_than_avg, "\n")
## Probability that wind_speed is greater than avg_wind_speed: 0.4472405
### The probability that wind speed will be greater than average wind speed has been calculated
# Collect the three probabilities (humidity, temperature, wind speed, in that
# order) and label the smallest one "anomaly"; the others stay "normal".
my_prob <- c(
  prob_humid_max_great_than_avg,
  prob_temp_max_great_than_avg,
  prob_wind_speed_great_than_avg
)
lowest_prob_index <- which.min(my_prob)
tags <- replace(rep("normal", length(my_prob)), lowest_prob_index, "anomaly")
print(tags)
## [1] "normal"  "normal"  "anomaly"
### The lowest probability value has been indicated as an anomaly 
# Bar chart: number of rows in grouped_data_1 for each District,
# with the x-axis labels rotated 45 degrees.
ggplot(grouped_data_1, aes(x = District)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Scatter plot of humidity_max (y) against temp_max (x) for every row of my_data
ggplot(my_data, aes(x = temp_max, y = humidity_max)) +
  geom_point()

# Box plot of wind_speed per District, filled by District,
# with the x-axis labels rotated 45 degrees.
ggplot(grouped_data_3, aes(x = District, y = wind_speed, fill = District)) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))