# Importing the packages and reading the data set
library(readr)
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ stringr   1.5.0
## ✔ forcats   1.0.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(pwr)
library(effsize)
# Load the comma-delimited Telangana 2018 weather file from disk.
my_data <- read_delim(
  file = "C:/Users/user/Documents/Statistics/Telangana_2018_complete_weather_data.csv",
  delim = ","
)
## Rows: 230384 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): District, Mandal, Location,  Date
## dbl (6): row_id, temp_min, temp_max, humidity_min, humidity_max, wind_speed
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Draw two samples of row indices (with replacement), each half the size of
# the full data set. A fixed seed before each draw makes it reproducible.
total_rows <- nrow(my_data)
sample_size <- round(0.5 * total_rows)

# seq_len() stays correct for an empty data set (1:0 would give c(1, 0)),
# and TRUE is spelled out because T is an ordinary, reassignable variable.
set.seed(2)
rand_sample_1 <- sample(seq_len(total_rows), sample_size, replace = TRUE)

set.seed(4)
rand_sample_2 <- sample(seq_len(total_rows), sample_size, replace = TRUE)

# Materialise the sampled rows as base data frames.
df_1 <- data.frame(my_data[rand_sample_1, ])
df_2 <- data.frame(my_data[rand_sample_2, ])

I extracted two random samples of the temp_max column from the data set.

# Pull the temp_max column out of each sampled data frame as a plain vector.
temp_max_1 <- df_1[["temp_max"]]
temp_max_2 <- df_2[["temp_max"]]

Hypothesis

H0: There is no significant difference between the means of temp_max_1 and temp_max_2.

H1: There is a significant difference between the means of temp_max_1 and temp_max_2.

# Descriptive statistics (min, quartiles, median, mean, max) for each temp_max
# sample. The `##` lines are the console output recorded when this was run.
summary(temp_max_1)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   22.30   31.80   34.60   34.76   37.70   45.40
summary(temp_max_2)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   22.20   31.80   34.60   34.78   37.70   45.40

The alpha value (significance level) is 0.05 (5%). It corresponds to a 5% chance of making a Type I error.

Power is 0.50 (50%). It is the probability of correctly rejecting the null hypothesis when it is false. It also means we have a 50% chance of detecting a true effect if it exists.

The minimum effect size is 0.50. It represents the smallest effect size that we consider practically significant.

# Two-sample t-test on the temp_max samples (t.test() defaults to the Welch
# unequal-variance form), followed by a fixed-alpha decision rule:
# reject H0 when |t| exceeds the two-sided critical value.
t_test_result <- t.test(temp_max_1, temp_max_2)

alpha <- 0.05

# Design values quoted in the write-up; the decision rule below does not use
# them.
power <- 0.5
effect_size <- 0.5

# The test statistic follows a t distribution, so the critical value is taken
# from qt() with the test's own degrees of freedom instead of the normal
# approximation qnorm().
critical_t_value <- qt(1 - alpha / 2, df = unname(t_test_result$parameter))

observed_t_value <- t_test_result$statistic
p_value <- t_test_result$p.value

cat("P-value:", p_value, "\n")
## P-value: 0.3020998
if (abs(observed_t_value) <= critical_t_value) {
  cat("Fail to reject the null hypothesis .\n")
} else {
  cat("Reject the null hypothesis .\n")
}
## Fail to reject the null hypothesis .
# Variance-ratio F-test on the two temp_max samples, judged against alpha.
# NOTE(review): pf(..., lower.tail = FALSE) is the upper-tail probability only,
# so this is a one-sided check of var(sample 1) > var(sample 2). It also
# compares variances, whereas the stated H0/H1 concern means - confirm that
# this is the intended test.
alpha <- 0.05
power <- 0.5
min_effect_size <- 0.5

# Degrees of freedom: one less than the size of each sample
df1 <- length(temp_max_1) - 1
df2 <- length(temp_max_2) - 1

# Sample variances and their ratio (the F statistic)
variance_temp_max_1 <- var(temp_max_1)
variance_temp_max_2 <- var(temp_max_2)
f_statistic <- variance_temp_max_1 / variance_temp_max_2

# Upper-tail probability of the observed ratio under F(df1, df2)
p_value <- pf(q = f_statistic, df1 = df1, df2 = df2, lower.tail = FALSE)

cat("Degrees of freedom (DF1, DF2):", df1, df2, "\n")
## Degrees of freedom (DF1, DF2): 115191 115191
cat("Fisher's F-test p-value:", p_value, "\n")
## Fisher's F-test p-value: 0.9570764
cat(
  if (p_value < alpha) {
    "Reject the null hypothesis \n"
  } else {
    "Fail to reject the null hypothesis \n"
  }
)
## Fail to reject the null hypothesis

I performed two different hypothesis tests, i.e., a Neyman-Pearson hypothesis test and a Fisher-style test. Both failed to reject the null hypothesis. My null hypothesis is “There is no significant difference between the means of temp_max_1 and temp_max_2”. The two samples have means of 34.76 and 34.78. Hence, failing to reject the null hypothesis means the data give no evidence of a difference, which is consistent with both samples having approximately the same mean.

# Pull the humidity_max column out of each sampled data frame as a plain vector.
humid_max_1 <- df_1[["humidity_max"]]
humid_max_2 <- df_2[["humidity_max"]]

I extracted two random samples of the humidity_max column from the data set.

Hypothesis

H0: There is no significant difference between the means of humid_max_1 and humid_max_2.

H1: There is a significant difference between the means of humid_max_1 and humid_max_2.

# Descriptive statistics (min, quartiles, median, mean, max) for each
# humidity_max sample. The `##` lines are the console output recorded when
# this was run.
summary(humid_max_1)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    4.00   71.00   83.10   81.09   94.70  100.00
summary(humid_max_2)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     4.0    70.9    83.1    81.0    94.6   100.0

The alpha value (significance level) is 0.05 (5%). It corresponds to a 5% chance of making a Type I error.

Power is 0.50 (50%). It is the probability of correctly rejecting the null hypothesis when it is false. It also means we have a 50% chance of detecting a true effect if it exists.

The minimum effect size is 0.50. It represents the smallest effect size that we consider practically significant.

# Two-sample t-test on the humidity_max samples (t.test() defaults to the
# Welch unequal-variance form), followed by a fixed-alpha decision rule:
# reject H0 when |t| exceeds the two-sided critical value.
t_test_result <- t.test(humid_max_1, humid_max_2)
alpha <- 0.05

# Design values quoted in the write-up; the decision rule below does not use
# them.
power <- 0.5
effect_size <- 0.5

# The test statistic follows a t distribution, so the critical value is taken
# from qt() with the test's own degrees of freedom instead of the normal
# approximation qnorm().
critical_t_value <- qt(1 - alpha / 2, df = unname(t_test_result$parameter))

observed_t_value <- t_test_result$statistic
p_value <- t_test_result$p.value

cat("P-value:", p_value, "\n")
## P-value: 0.1594279
if (abs(observed_t_value) <= critical_t_value) {
  cat("Fail to reject the null hypothesis (H0).\n")
} else {
  cat("Reject the null hypothesis (H0).\n")
}
## Fail to reject the null hypothesis (H0).
# Variance-ratio F-test on the two humidity_max samples, judged against alpha.
# NOTE(review): pf(..., lower.tail = FALSE) is the upper-tail probability only,
# so this is a one-sided check of var(sample 1) > var(sample 2). It also
# compares variances, whereas the stated H0/H1 concern means - confirm that
# this is the intended test.
alpha <- 0.05
power <- 0.5
min_effect_size <- 0.5

# Degrees of freedom: one less than the size of each sample
df1 <- length(humid_max_1) - 1
df2 <- length(humid_max_2) - 1

# Sample variances and their ratio (the F statistic)
variance_humid_max_1 <- var(humid_max_1)
variance_humid_max_2 <- var(humid_max_2)
f_statistic <- variance_humid_max_1 / variance_humid_max_2

# Upper-tail probability of the observed ratio under F(df1, df2)
p_value <- pf(q = f_statistic, df1 = df1, df2 = df2, lower.tail = FALSE)

cat("Degrees of freedom (DF1, DF2):", df1, df2, "\n")
## Degrees of freedom (DF1, DF2): 115191 115191
cat("Fisher's F-test p-value:", p_value, "\n")
## Fisher's F-test p-value: 0.9900585
cat(
  if (p_value < alpha) {
    "Reject the null hypothesis \n"
  } else {
    "Fail to reject the null hypothesis \n"
  }
)
## Fail to reject the null hypothesis

I performed two different hypothesis tests, i.e., a Neyman-Pearson hypothesis test and a Fisher-style test. Both failed to reject the null hypothesis. My null hypothesis is “There is no significant difference between the means of humid_max_1 and humid_max_2”. The two samples have means of 81.09 and 81.0. Hence, failing to reject the null hypothesis means the data give no evidence of a difference, which is consistent with both samples having approximately the same mean.

# Side-by-side box plots of the two temp_max samples.
# Each group label is repeated once per observation in its own sample
# (times = ...), so labels stay aligned with values even if the two samples
# differ in length.
ggplot(
  data = data.frame(
    Group = rep(
      c("TempMax 1", "TempMax 2"),
      times = c(length(temp_max_1), length(temp_max_2))
    ),
    TempMax = c(temp_max_1, temp_max_2)
  )
) +
  geom_boxplot(aes(x = Group, y = TempMax, fill = Group)) +
  labs(
    title = "Box Plot for temp_max Comparison",
    x = "Groups",
    y = "temp_max"
  ) +
  theme_minimal()

# Side-by-side box plots of the two humidity_max samples.
# Each group label is repeated once per observation in its own sample
# (times = ...), so labels stay aligned with values even if the two samples
# differ in length.
ggplot(
  data = data.frame(
    Group = rep(
      c("HumidMax 1", "HumidMax 2"),
      times = c(length(humid_max_1), length(humid_max_2))
    ),
    HumidMax = c(humid_max_1, humid_max_2)
  )
) +
  geom_boxplot(aes(x = Group, y = HumidMax, fill = Group)) +
  labs(
    title = "Box Plot for humidity_max Comparison",
    x = "Groups",
    y = "humidity_max"
  ) +
  theme_minimal()

The two box plots above, for temp_max and humidity_max, show similar central tendencies and spreads within each pair of samples, so they support the hypothesis-test results that failed to reject the null hypothesis. This suggests that there is no significant difference between the means of the two samples of each column. I can also conclude that the samples in my data set have approximately the same statistical values, i.e., mean.