# Importing the packages and reading the data set
library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ purrr     1.0.2
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
# Load the comma-delimited 2018 Telangana weather file into a tibble.
my_data <- read_delim(
  file = "C:/Users/user/Documents/Statistics/Telangana_2018_complete_weather_data.csv",
  delim = ","
)
## Rows: 230384 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): District, Mandal, Location,  Date
## dbl (6): row_id, temp_min, temp_max, humidity_min, humidity_max, wind_speed
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

1. Build at least three sets of variable combinations

For each set of variables, include at least one column that you created (i.e., calculated based on others)
All variables for this data dive should be either continuous (i.e., numeric) or ordered (e.g., [‘small’, ‘medium’, ‘large’] is okay, but [“apples”, “oranges”, “bananas”] is not)
For each set, there should be one response variable with the others as explanatory variables
# creating a variable_set_1
# Keep the two source temperature columns and derive their midpoint as
# avg_temp. A per-row derived column belongs in mutate(): summarise() is meant
# to collapse each group to a single row, and returning several rows per group
# is deprecated since dplyr 1.1.0. Grouping by the very columns being averaged
# added nothing, so it is dropped; the result is an ungrouped tibble with one
# row per observation of my_data.
variable_set_1 <- my_data %>%
  select(temp_min, temp_max) %>%
  mutate(avg_temp = (temp_min + temp_max) / 2)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'temp_min', 'temp_max'. You can override
## using the `.groups` argument.
summary(variable_set_1)
##     temp_min        temp_max        avg_temp    
##  Min.   : 5.00   Min.   :22.00   Min.   :17.05  
##  1st Qu.:20.20   1st Qu.:31.80   1st Qu.:26.15  
##  Median :23.50   Median :34.60   Median :28.60  
##  Mean   :22.51   Mean   :34.76   Mean   :28.63  
##  3rd Qu.:25.40   3rd Qu.:37.70   3rd Qu.:31.10  
##  Max.   :34.60   Max.   :45.40   Max.   :39.80
# Add an ordered factor to variable_set_1 that bins avg_temp into three
# categories: up to 26 is Low_temp, above 26 up to 31 is Moderate_Temp, and
# anything above 31 is High_Temp.
variable_set_1[["Temperature_Range"]] <- cut(
  x = variable_set_1[["avg_temp"]],
  breaks = c(-Inf, 26, 31, Inf),
  labels = c("Low_temp", "Moderate_Temp", "High_Temp"),
  ordered_result = TRUE
)
# Show the augmented table in the data viewer.
view(variable_set_1)
# creating a variable_set_2
# Keep the two source humidity columns and derive their midpoint as avg_humid.
# A per-row derived column belongs in mutate(): summarise() is meant to
# collapse each group to a single row, and returning several rows per group is
# deprecated since dplyr 1.1.0. Grouping by the very columns being averaged
# added nothing, so it is dropped; the result is an ungrouped tibble with one
# row per observation of my_data.
variable_set_2 <- my_data %>%
  select(humidity_min, humidity_max) %>%
  mutate(avg_humid = (humidity_min + humidity_max) / 2)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'humidity_min', 'humidity_max'. You can
## override using the `.groups` argument.
summary(variable_set_2)
##   humidity_min   humidity_max      avg_humid    
##  Min.   : 0.1   Min.   :  4.00   Min.   :11.35  
##  1st Qu.:24.6   1st Qu.: 70.90   1st Qu.:48.95  
##  Median :36.9   Median : 83.10   Median :60.25  
##  Mean   :41.3   Mean   : 81.08   Mean   :61.19  
##  3rd Qu.:55.5   3rd Qu.: 94.70   3rd Qu.:73.35  
##  Max.   :99.9   Max.   :100.00   Max.   :99.95
# Add an ordered factor to variable_set_2 that bins avg_humid into three
# categories: up to 49 is Low_Humidity, above 49 up to 73 is
# Moderate_Humidity, and anything above 73 is High_Humidity.
variable_set_2[["humidity_Range"]] <- cut(
  x = variable_set_2[["avg_humid"]],
  breaks = c(-Inf, 49, 73, Inf),
  labels = c("Low_Humidity", "Moderate_Humidity", "High_Humidity"),
  ordered_result = TRUE
)
# Show the augmented table in the data viewer.
view(variable_set_2)
# creating a variable_set_3
# Keep only the wind_speed column. The previous summarise(wind_speed / 1) was
# a no-op division used to copy the column, and it relied on summarise()
# returning many rows, which is deprecated since dplyr 1.1.0. select() yields
# the same one-column tibble without the warning.
variable_set_3 <- my_data %>%
  select(wind_speed)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
summary(variable_set_3$wind_speed)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.10    6.80   10.20   10.82   14.30   69.00
# Add an ordered factor to variable_set_3 that bins wind_speed into three
# categories: up to 7 is Low_Wind, above 7 up to 14 is Moderate_Wind, and
# anything above 14 is High_Wind.
variable_set_3[["wind_speed_Range"]] <- cut(
  x = variable_set_3[["wind_speed"]],
  breaks = c(-Inf, 7, 14, Inf),
  labels = c("Low_Wind", "Moderate_Wind", "High_Wind"),
  ordered_result = TRUE
)
# Show the augmented table in the data viewer.
view(variable_set_3)

2. Plot a visualization for each response-explanatory relationship, and draw some conclusions based on the plot

Use what we’ve covered so far in class to scrutinize the plot (e.g., are there any outliers?)
# Scatter plot of the temperature category (y) against the average
# temperature it was derived from (x), with points coloured by category.
ggplot(
  data = variable_set_1,
  mapping = aes(x = avg_temp, y = Temperature_Range, fill = Temperature_Range)
) +
  geom_point(mapping = aes(colour = Temperature_Range)) +
  labs(
    x = "Average Temperature",
    y = "Temperature Range",
    title = "Temperature Range vs. Average Temperature"
  )

The above scatterplot (Temperature Range vs Average Temperature) shows that most of the data points fall in the low-temperature and high-temperature zones. As the average temperature increases, data points tend to fall in the high-temperature zone. Moreover, the data do not show any significant outliers, indicating that the temperature data are consistent.
# Scatter plot of the humidity category (y) against the average humidity it
# was derived from (x), with points coloured by category.
ggplot(
  data = variable_set_2,
  mapping = aes(x = avg_humid, y = humidity_Range, fill = humidity_Range)
) +
  geom_point(mapping = aes(colour = humidity_Range)) +
  labs(
    x = "Average Humidity",
    y = "Humidity Range",
    title = "Humidity Range vs. Average Humidity"
  )

The above scatterplot (Humidity Range vs Average Humidity) shows that there is a relationship between average humidity and humidity range. As the average humidity values increase, they tend to fall into the High_Humidity range. Moreover, the scatter plot does not show any major outliers, indicating relatively consistent humidity levels.
# Scatter plot of the wind-speed category (y) against the wind speed it was
# derived from (x), with points coloured by category.
ggplot(
  data = variable_set_3,
  mapping = aes(x = wind_speed, y = wind_speed_Range, fill = wind_speed_Range)
) +
  geom_point(mapping = aes(colour = wind_speed_Range)) +
  labs(
    x = "Wind Speed",
    y = "Wind Speed Range",
    title = "Wind Speed Range vs. Wind Speed"
  )

The above scatterplot (Wind Speed Range vs Wind Speed) indicates that the majority of data points fall into the Low_Wind range, indicating that low wind speeds are common in the dataset. There are some outliers with very high wind speeds that fall into the High_Wind range. These outliers may represent extreme weather conditions or measurement errors.

3. Calculate the appropriate correlation coefficient for each of these combinations

Explain why the value makes sense (or doesn’t) based on the visualization(s)
# Pearson correlation between avg_temp and the integer codes (1 to 3) of the
# ordered Temperature_Range factor.
# NOTE(review): the second variable is ordinal; a rank-based method such as
# "spearman" may be the more appropriate coefficient - confirm with the
# assignment's intent.
with(
  variable_set_1,
  cor(avg_temp, as.numeric(Temperature_Range), method = "pearson")
)
## [1] 0.9138537
The correlation coefficient between average temperature and temperature range is 0.9138537, which is close to 1. It indicates a strong positive correlation, which means that as the average temperature increases, it tends to fall into the High_Temp range.
# Pearson correlation between avg_humid and the integer codes (1 to 3) of the
# ordered humidity_Range factor.
# NOTE(review): the second variable is ordinal; a rank-based method such as
# "spearman" may be the more appropriate coefficient - confirm with the
# assignment's intent.
with(
  variable_set_2,
  cor(avg_humid, as.numeric(humidity_Range), method = "pearson")
)
## [1] 0.9126292
The correlation coefficient between average humidity and humidity range is 0.9126292, which is close to 1. It indicates a strong positive correlation, which means that as the average humidity increases, it tends to fall into the High_Humidity range.
# Pearson correlation between wind_speed and the integer codes (1 to 3) of the
# ordered wind_speed_Range factor.
# NOTE(review): the second variable is ordinal; a rank-based method such as
# "spearman" may be the more appropriate coefficient - confirm with the
# assignment's intent.
with(
  variable_set_3,
  cor(wind_speed, as.numeric(wind_speed_Range), method = "pearson")
)
## [1] 0.8506889
The correlation coefficient between wind_speed and wind_speed range is 0.8506889. This is still a strong positive correlation, but it is weaker than for the other two sets, which suggests there may be a non-linear relationship between the variables.

4. Build a confidence interval for each of the response variables. Provide a detailed conclusion of the response variable (i.e., the population) based on your confidence interval.

# Confidence interval for Average Temperature
# Despite its name, temp_range holds the avg_temp values themselves. A
# one-sample t-test at the default 95% level supplies the interval for their
# mean.
temp_range <- variable_set_1[["avg_temp"]]
conf_interval_temp_range <- t.test(x = temp_range)[["conf.int"]]
conf_interval_temp_range
## [1] 28.61777 28.64753
## attr(,"conf.level")
## [1] 0.95
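The confidence interval for Average Temperature indicates that we are 95% confident that the population mean of Average Temperature lies between the lower bound, 28.61777, and the upper bound, 28.64753. Note that this is a statement about the mean, not about where individual data values fall.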
# Confidence interval for Average Humidity
# Despite its name, humid_range holds the avg_humid values themselves. A
# one-sample t-test at the default 95% level supplies the interval for their
# mean.
humid_range <- variable_set_2[["avg_humid"]]
conf_interval_humid_range <- t.test(x = humid_range)[["conf.int"]]
conf_interval_humid_range
## [1] 61.12038 61.25417
## attr(,"conf.level")
## [1] 0.95
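The confidence interval for Average Humidity indicates that we are 95% confident that the population mean of Average Humidity lies between the lower bound, 61.12038, and the upper bound, 61.25417. Note that this is a statement about the mean, not about where individual data values fall.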
# Confidence interval for Wind Speed
# Despite its name, wind_speed_Range here holds the raw wind_speed values, not
# the binned factor column of the same name. A one-sample t-test at the
# default 95% level supplies the interval for their mean.
wind_speed_Range <- variable_set_3[["wind_speed"]]
conf_interval_wind_speed_Range <- t.test(x = wind_speed_Range)[["conf.int"]]
conf_interval_wind_speed_Range
## [1] 10.79247 10.84307
## attr(,"conf.level")
## [1] 0.95
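The confidence interval for Wind Speed indicates that we are 95% confident that the population mean of Wind Speed lies between the lower bound, 10.79247, and the upper bound, 10.84307. Note that this is a statement about the mean, not about where individual data values fall.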