# Importing the packages and reading the data set
library(readr)
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ stringr 1.5.0
## ✔ forcats 1.0.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(pwr)
library(effsize)
# Load the Telangana 2018 weather observations from a comma-delimited file.
my_data <- read_delim(
  file = "C:/Users/user/Documents/Statistics/Telangana_2018_complete_weather_data.csv",
  delim = ","
)
## Rows: 230384 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): District, Mandal, Location, Date
## dbl (6): row_id, temp_min, temp_max, humidity_min, humidity_max, wind_speed
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Draw two independent random samples of row indices (with replacement), each
# half the size of the full data set, and pull out the temp_max column of each.
total_rows <- nrow(my_data)
sample_size <- round(0.5 * total_rows)

# seq_len() is used instead of 1:total_rows so an empty table yields an empty
# index set rather than c(1, 0); TRUE is spelled out because T can be reassigned.
# A fixed seed before each draw keeps the two samples reproducible.
set.seed(2)
rand_sample_1 <- sample(seq_len(total_rows), sample_size, replace = TRUE)
set.seed(4)
rand_sample_2 <- sample(seq_len(total_rows), sample_size, replace = TRUE)

# Materialise each sample as a base data.frame.
df_1 <- data.frame(my_data[rand_sample_1, ])
df_2 <- data.frame(my_data[rand_sample_2, ])

# Daily maximum temperature for each sample.
temp_max_1 <- df_1$temp_max
temp_max_2 <- df_2$temp_max
# H0: There is no significant difference between the means of temp_max_1 and temp_max_2.
# H1: There is a significant difference between the means of temp_max_1 and temp_max_2.
# Descriptive statistics for the two temp_max samples.
summary(temp_max_1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 22.30 31.80 34.60 34.76 37.70 45.40
summary(temp_max_2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 22.20 31.80 34.60 34.78 37.70 45.40

# Two-sample t-test on the means, using t.test() with its default settings.
t_test_result <- t.test(temp_max_1, temp_max_2)
alpha <- 0.05
# NOTE(review): power and effect_size are assigned here but are not read by
# the test below.
power <- 0.5
effect_size <- 0.5

# Two-sided critical value taken from the t distribution with the degrees of
# freedom reported by the test itself, rather than a normal approximation, so
# the decision rule stays consistent with the reported p-value at any sample
# size.
critical_t_value <- qt(1 - alpha / 2, df = unname(t_test_result$parameter))
observed_t_value <- t_test_result$statistic
p_value <- t_test_result$p.value
cat("P-value:", p_value, "\n")
## P-value: 0.3020998

# Reject H0 only when the observed statistic lies beyond the critical value.
if (abs(observed_t_value) <= critical_t_value) {
  cat("Fail to reject the null hypothesis .\n")
} else {
  cat("Reject the null hypothesis .\n")
}
## Fail to reject the null hypothesis .
# Variance-ratio (F) comparison of temp_max_1 against temp_max_2.
# Test parameters; only alpha is read in this section.
alpha <- 0.05
power <- 0.5
min_effect_size <- 0.5

# Sample variance of each group.
variance_temp_max_1 <- var(temp_max_1)
variance_temp_max_2 <- var(temp_max_2)

# F-statistic: first group's variance divided by the second group's.
f_statistic <- variance_temp_max_1 / variance_temp_max_2

# Numerator and denominator degrees of freedom (one less than each group size).
df1 <- length(temp_max_1) - 1
df2 <- length(temp_max_2) - 1

# Upper-tail probability of the F-statistic.
# NOTE(review): this is a one-sided p-value (alternative: variance 1 greater
# than variance 2). A two-sided test of equal variances would double the
# smaller tail -- confirm which alternative is intended.
p_value <- pf(q = f_statistic, df1 = df1, df2 = df2, lower.tail = FALSE)
cat("Degrees of freedom (DF1, DF2):", df1, df2, "\n")
## Degrees of freedom (DF1, DF2): 115191 115191
cat("Fisher's F-test p-value:", p_value, "\n")
## Fisher's F-test p-value: 0.9570764

# Report the decision at significance level alpha.
cat(
  if (p_value < alpha) {
    "Reject the null hypothesis \n"
  } else {
    "Fail to reject the null hypothesis \n"
  }
)
## Fail to reject the null hypothesis
# Daily maximum humidity for each of the two samples.
humid_max_1 <- df_1[["humidity_max"]]
humid_max_2 <- df_2[["humidity_max"]]
# H0: There is no significant difference between the means of humid_max_1 and humid_max_2.
# H1: There is a significant difference between the means of humid_max_1 and humid_max_2.
# Descriptive statistics for the two humidity_max samples.
summary(humid_max_1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.00 71.00 83.10 81.09 94.70 100.00
summary(humid_max_2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.0 70.9 83.1 81.0 94.6 100.0

# Two-sample t-test on the means, using t.test() with its default settings.
t_test_result <- t.test(humid_max_1, humid_max_2)
alpha <- 0.05
# NOTE(review): power and effect_size are assigned here but are not read by
# the test below.
power <- 0.5
effect_size <- 0.5

# Two-sided critical value taken from the t distribution with the degrees of
# freedom reported by the test itself, rather than a normal approximation, so
# the decision rule stays consistent with the reported p-value at any sample
# size.
critical_t_value <- qt(1 - alpha / 2, df = unname(t_test_result$parameter))
observed_t_value <- t_test_result$statistic
p_value <- t_test_result$p.value
cat("P-value:", p_value, "\n")
## P-value: 0.1594279

# Reject H0 only when the observed statistic lies beyond the critical value.
if (abs(observed_t_value) <= critical_t_value) {
  cat("Fail to reject the null hypothesis (H0).\n")
} else {
  cat("Reject the null hypothesis (H0).\n")
}
## Fail to reject the null hypothesis (H0).
# Variance-ratio (F) comparison of humid_max_1 against humid_max_2.
# Test parameters; only alpha is read in this section.
alpha <- 0.05
power <- 0.5
min_effect_size <- 0.5

# Sample variance of each group.
variance_humid_max_1 <- var(humid_max_1)
variance_humid_max_2 <- var(humid_max_2)

# F-statistic: first group's variance divided by the second group's.
f_statistic <- variance_humid_max_1 / variance_humid_max_2

# Numerator and denominator degrees of freedom (one less than each group size).
df1 <- length(humid_max_1) - 1
df2 <- length(humid_max_2) - 1

# Upper-tail probability of the F-statistic.
# NOTE(review): this is a one-sided p-value (alternative: variance 1 greater
# than variance 2). A two-sided test of equal variances would double the
# smaller tail -- confirm which alternative is intended.
p_value <- pf(q = f_statistic, df1 = df1, df2 = df2, lower.tail = FALSE)
cat("Degrees of freedom (DF1, DF2):", df1, df2, "\n")
## Degrees of freedom (DF1, DF2): 115191 115191
cat("Fisher's F-test p-value:", p_value, "\n")
## Fisher's F-test p-value: 0.9900585

# Report the decision at significance level alpha.
cat(
  if (p_value < alpha) {
    "Reject the null hypothesis \n"
  } else {
    "Fail to reject the null hypothesis \n"
  }
)
## Fail to reject the null hypothesis
# Create a box plot for temp_max
# The two samples are stacked into one long data frame with a Group label.
# Each label is repeated by its own sample's length, so the plot still works
# if the two vectors ever differ in size.
ggplot(
  data = data.frame(
    Group = rep(
      c("TempMax 1", "TempMax 2"),
      times = c(length(temp_max_1), length(temp_max_2))
    ),
    TempMax = c(temp_max_1, temp_max_2)
  )
) +
  geom_boxplot(aes(x = Group, y = TempMax, fill = Group)) +
  labs(
    title = "Box Plot for temp_max Comparison",
    x = "Groups",
    y = "temp_max"
  ) +
  theme_minimal()
# Create a box plot for humidity_max
# The two samples are stacked into one long data frame with a Group label.
# Each label is repeated by its own sample's length, so the plot still works
# if the two vectors ever differ in size.
ggplot(
  data = data.frame(
    Group = rep(
      c("HumidMax 1", "HumidMax 2"),
      times = c(length(humid_max_1), length(humid_max_2))
    ),
    HumidMax = c(humid_max_1, humid_max_2)
  )
) +
  geom_boxplot(aes(x = Group, y = HumidMax, fill = Group)) +
  labs(
    title = "Box Plot for humidity_max Comparison",
    x = "Groups",
    y = "humidity_max"
  ) +
  theme_minimal()