# Point the R session at the folder that holds the case-study workbook.
# NOTE(review): this is a machine-specific absolute path, so the script only
# runs unchanged on a machine where this directory exists.
setwd("C:\\Users\\Admin\\Dropbox\\PC\\Desktop\\IIMK-BA\\Hypothesis Testing-Jayalaxmi Agro Tech")
library(readxl)
library(janitor)
## Warning: package 'janitor' was built under R version 4.2.1
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(ggplot2)
# Load the three worksheets used in the analysis from the case workbook.
# The workbook path is named once so the three reads cannot drift apart, and
# `col_names` uses TRUE rather than the reassignable shorthand `T`.
workbook_path <- "IMB733-XLS-ENG.xlsx"
Data <- read_xlsx(workbook_path, sheet = "DataSheet", col_names = TRUE)
Belagavi <- read_xlsx(workbook_path, sheet = "Belagavi_weather", col_names = TRUE)
Dharwad <- read_xlsx(workbook_path, sheet = "Dharwad_weather", col_names = TRUE)

# Normalise the column names of the two weather sheets with
# janitor::clean_names().
Belagavi <- clean_names(Belagavi)
Dharwad <- clean_names(Dharwad)

# Inspect the structure of the main sheet before its names are cleaned.
str(Data)
## tibble [123 × 26] (S3: tbl_df/tbl/data.frame)
## $ Month-Year : POSIXct[1:123], format: "2015-06-01" "2015-07-01" ...
## $ Week : chr [1:123] "Week4" "Week1" "Week2" "Week3" ...
## $ No of users : num [1:123] 2 1 1 4 6 12 13 10 7 12 ...
## $ Usage : num [1:123] 4 1 25 70 100 291 225 141 148 215 ...
## $ D1 : num [1:123] 0 0 0 4 1 12 7 4 1 5 ...
## $ D2 : num [1:123] 0 0 1 2 1 6 5 4 0 3 ...
## $ D3 : num [1:123] 1 0 2 3 0 11 6 8 1 6 ...
## $ D4 : num [1:123] 0 0 2 4 2 4 2 4 5 3 ...
## $ D5 : num [1:123] 0 0 0 2 0 5 5 3 4 6 ...
## $ D6 : num [1:123] 0 0 0 4 2 15 6 5 5 12 ...
## $ D7 : num [1:123] 0 0 2 1 7 7 4 8 3 7 ...
## $ D8 : num [1:123] 0 0 3 7 9 7 5 7 3 33 ...
## $ D9 : num [1:123] 0 0 2 3 2 6 6 6 3 11 ...
## $ D10 : num [1:123] 0 0 1 0 1 10 1 4 2 3 ...
## $ D11 : num [1:123] 0 0 1 3 4 12 6 3 5 8 ...
## $ V1 : num [1:123] 0 0 0 5 11 26 28 18 18 20 ...
## $ V2 : num [1:123] 0 1 0 4 8 20 19 11 16 15 ...
## $ V3 : num [1:123] 0 0 1 2 5 12 13 8 7 10 ...
## $ V4 : num [1:123] 0 0 1 2 5 13 8 7 6 6 ...
## $ V5 : num [1:123] 2 0 1 1 9 16 9 4 15 6 ...
## $ V6 : num [1:123] 0 0 1 3 3 22 14 4 10 10 ...
## $ V7 : num [1:123] 0 0 0 3 7 21 13 5 4 9 ...
## $ V8 : num [1:123] 0 0 0 2 6 14 9 5 10 7 ...
## $ V9 : num [1:123] 0 0 0 4 7 16 20 10 9 8 ...
## $ V10 : num [1:123] 0 0 1 4 7 23 17 5 14 10 ...
## $ Micronutrient: num [1:123] 1 0 6 7 3 13 22 8 7 17 ...
# Normalise the main sheet's column names (e.g. "Month-Year" -> month_year).
Data <- clean_names(Data)

# One-sample t-test on the D6 column for rows from October 2017 onward.
# H0: mean <= 60; Ha: mean > 60.
# The cutoff is an explicit UTC timestamp: comparing a POSIXct column with a
# bare string parses the string in the session's local time zone, so the
# selected rows could differ from one machine to another.
# NOTE(review): assumes month_year is stored in UTC, which is how readxl
# returns Excel dates - confirm.
Data1 <- subset(Data, month_year >= as.POSIXct("2017-10-01", tz = "UTC"))
Res1 <- t.test(Data1$d6, mu = 60, alternative = "greater")
Res1
##
## One Sample t-test
##
## data: Data1$d6
## t = 2.341, df = 28, p-value = 0.01329
## alternative hypothesis: true mean is greater than 60
## 95 percent confidence interval:
## 62.29976 Inf
## sample estimates:
## mean of x
## 68.41379
# Label every row by period: January 2017 onward is "2017-18", anything
# earlier is "2015-16".
# The cutoff is an explicit UTC timestamp; a bare string would be parsed in the
# session's local time zone and could shift rows between the two groups.
# NOTE(review): assumes month_year is stored in UTC (readxl's behaviour for
# Excel dates) - confirm.
Data$group <- factor(
  ifelse(
    Data$month_year >= as.POSIXct("2017-01-01", tz = "UTC"),
    "2017-18",
    "2015-16"
  )
)
str(Data)
## tibble [123 × 27] (S3: tbl_df/tbl/data.frame)
## $ month_year : POSIXct[1:123], format: "2015-06-01" "2015-07-01" ...
## $ week : chr [1:123] "Week4" "Week1" "Week2" "Week3" ...
## $ no_of_users : num [1:123] 2 1 1 4 6 12 13 10 7 12 ...
## $ usage : num [1:123] 4 1 25 70 100 291 225 141 148 215 ...
## $ d1 : num [1:123] 0 0 0 4 1 12 7 4 1 5 ...
## $ d2 : num [1:123] 0 0 1 2 1 6 5 4 0 3 ...
## $ d3 : num [1:123] 1 0 2 3 0 11 6 8 1 6 ...
## $ d4 : num [1:123] 0 0 2 4 2 4 2 4 5 3 ...
## $ d5 : num [1:123] 0 0 0 2 0 5 5 3 4 6 ...
## $ d6 : num [1:123] 0 0 0 4 2 15 6 5 5 12 ...
## $ d7 : num [1:123] 0 0 2 1 7 7 4 8 3 7 ...
## $ d8 : num [1:123] 0 0 3 7 9 7 5 7 3 33 ...
## $ d9 : num [1:123] 0 0 2 3 2 6 6 6 3 11 ...
## $ d10 : num [1:123] 0 0 1 0 1 10 1 4 2 3 ...
## $ d11 : num [1:123] 0 0 1 3 4 12 6 3 5 8 ...
## $ v1 : num [1:123] 0 0 0 5 11 26 28 18 18 20 ...
## $ v2 : num [1:123] 0 1 0 4 8 20 19 11 16 15 ...
## $ v3 : num [1:123] 0 0 1 2 5 12 13 8 7 10 ...
## $ v4 : num [1:123] 0 0 1 2 5 13 8 7 6 6 ...
## $ v5 : num [1:123] 2 0 1 1 9 16 9 4 15 6 ...
## $ v6 : num [1:123] 0 0 1 3 3 22 14 4 10 10 ...
## $ v7 : num [1:123] 0 0 0 3 7 21 13 5 4 9 ...
## $ v8 : num [1:123] 0 0 0 2 6 14 9 5 10 7 ...
## $ v9 : num [1:123] 0 0 0 4 7 16 20 10 9 8 ...
## $ v10 : num [1:123] 0 0 1 4 7 23 17 5 14 10 ...
## $ micronutrient: num [1:123] 1 0 6 7 3 13 22 8 7 17 ...
## $ group : Factor w/ 2 levels "2015-16","2017-18": 1 1 1 1 1 1 1 1 1 1 ...
# Make "2017-18" the first level so the test evaluates
# mean(2017-18) - mean(2015-16).
Data$group <- relevel(Data$group, ref = "2017-18")

# Pooled-variance two-sample t-test.
# Ha: the mean number of users in 2017-18 is greater than in 2015-16.
# `var.equal` is written out in full (the abbreviation `var.eq` only worked
# through partial argument matching) and TRUE replaces the reassignable `T`.
t_test <- t.test(
  Data$no_of_users ~ Data$group,
  alternative = "greater",
  var.equal = TRUE
)
t_test
##
## Two Sample t-test
##
## data: Data$no_of_users by Data$group
## t = 9.2567, df = 121, p-value = 4.753e-16
## alternative hypothesis: true difference in means between group 2017-18 and group 2015-16 is greater than 0
## 95 percent confidence interval:
## 107.5685 Inf
## sample estimates:
## mean in group 2017-18 mean in group 2015-16
## 181.10000 50.06849
In order to examine the stated hypothesis, the study applied an independent-samples t-test. In this test, we compared the average weekly users in 2017-2018 against the average weekly users in 2015-2016. The test results were t = 9.2567, df = 121, p-value = 4.753e-16. Since the p-value is less than alpha (0.05), we reject the null hypothesis and infer that the average number of weekly users during 2017-2018 is higher than during 2015-2016.
Check whether app usage is the same or different across the four weeks of a month. Anand claims that app usage picked up after January 2016, so test this hypothesis using data from January 2016 to May 2018.
# Keep rows from January 2016 onward and compare usage across the
# week-of-month labels with a one-way ANOVA.
# The cutoff is an explicit UTC timestamp; a bare string would be parsed in the
# session's local time zone and could change which rows are kept. The column is
# referenced directly rather than through `Data$`, as dplyr verbs expect.
# NOTE(review): assumes month_year is stored in UTC (readxl's behaviour for
# Excel dates) - confirm.
Data4 <- filter(Data, month_year >= as.POSIXct("2016-01-01", tz = "UTC"))
Data5 <- select(Data4, week, usage)
Data5$week <- as.factor(Data5$week)
levels(Data5$week)
## [1] "Week1" "Week2" "Week3" "Week4"

# H0: mean usage is the same in every week of the month.
# NOTE: the object name `anova` shadows stats::anova(); it is kept because the
# model.tables() call further down refers to it.
anova <- aov(usage ~ week, data = Data5)
summary(anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## week 3 1675404 558468 2.319 0.0804 .
## Residuals 94 22633380 240781
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Print the grand mean and the per-week mean usage from the fitted ANOVA.
# The documented value "means" is spelled out; "mean" only worked through
# partial matching of the `type` choices.
model.tables(anova, type = "means")
## Tables of means
## Grand mean
##
## 657.7041
##
## week
## Week1 Week2 Week3 Week4
## 622.2 583.8 541.5 875.8
## rep 25.0 24.0 24.0 25.0
H0: usage did not pick up after January 2016 (Mu1 >= Mu2). Ha: usage picked up after January 2016 (Mu1 < Mu2). Here Mu1 is the mean usage before 2016 and Mu2 is the mean usage from January 2016 onward.
# Re-label the rows for the "usage picked up after January 2016" test. This
# overwrites the 2015-16 / 2017-18 grouping previously held in Data$group.
# The cutoff is now 1 January 2016 as an explicit UTC timestamp, matching the
# "2016Onward" label and the stated hypothesis, and no longer depends on the
# session's local time zone.
# NOTE(review): this selects the same rows as the earlier "2015-12-31" string
# cutoff provided month_year holds only first-of-month dates stored in UTC, as
# the printed structure suggests - confirm.
Data$group <- factor(
  ifelse(
    Data$month_year >= as.POSIXct("2016-01-01", tz = "UTC"),
    "2016Onward",
    "Before2016"
  )
)

# "2016Onward" first, so the test evaluates mean(2016Onward) - mean(Before2016).
Data$group <- relevel(Data$group, ref = "2016Onward")
Data7 <- select(Data, usage, group)

# Pooled-variance two-sample t-test; `var.equal` is written out in full rather
# than relying on partial matching of `var.eq`.
t.test(Data7$usage ~ Data7$group, alternative = "greater", var.equal = TRUE)
##
## Two Sample t-test
##
## data: Data7$usage by Data7$group
## t = 3.5721, df = 121, p-value = 0.0002547
## alternative hypothesis: true difference in means between group 2016Onward and group Before2016 is greater than 0
## 95 percent confidence interval:
## 198.3213 Inf
## sample estimates:
## mean in group 2016Onward mean in group Before2016
## 657.7041 287.6800
# Plot mean usage per month for rows from August 2016 onward.
# The cutoff is an explicit UTC timestamp; a bare string would be parsed in the
# session's local time zone.
# NOTE(review): assumes month_year is stored in UTC (readxl's behaviour for
# Excel dates) - confirm.
Data8 <- Data %>%
  filter(month_year >= as.POSIXct("2016-08-01", tz = "UTC"))

# For a POSIXct input the second argument of as.IDate() is a time zone, not a
# format. The old '%d/%m/%Y' string was therefore read as a time zone and
# raised an "unknown timezone" warning; the column's own time zone is used now.
Data8$month <- month(as.IDate(Data8$month_year))

# One row per month_year with the average usage across its rows.
Data9 <- Data8 %>%
  select(month_year, usage) %>%
  group_by(month_year) %>%
  summarise(Mean_usage = mean(usage))

# Blue line with red points; group = 1 joins all months into a single line.
UsagePlot <- ggplot(
  data = Data9,
  aes(x = month_year, y = Mean_usage, group = 1)
) +
  geom_line(color = "blue", size = 1.5) +
  geom_point(color = "red", size = 3)
UsagePlot
then the access of that disease should be more in the months having suitable weather conditions. Help the analyst in coming up with a statistical test to support the claim for two districts for which the sample of weather and disease access data is provided in the data sheet. Identify the diseases for which you can support this claim. Test this claim both for temperature and relative humidity at 95% confidence.
H0: µ2 <= µ1, where µ1 is the average disease access when weather conditions are not favorable to the disease and µ2 is the average disease access when they are favorable. Ha: µ2 > µ1.
# Disease D1: a row is flagged favourable ("Yes") when temperature lies in
# [20, 24] and relative humidity is above 80; every other row is "No".
# "Yes" is made the first level so the one-sided tests evaluate
# mean(Yes) - mean(No) > 0.
Q5BelD1 <- Belagavi
Q5BelD1$Fcond <- with(
  Q5BelD1,
  factor(ifelse(
    temperature >= 20 & temperature <= 24 & relative_humidity > 80,
    "Yes", "No"
  ))
)
Q5BelD1$Fcond <- relevel(Q5BelD1$Fcond, ref = "Yes")

Q5DharD1 <- Dharwad
Q5DharD1$Fcond <- with(
  Q5DharD1,
  factor(ifelse(
    temperature >= 20 & temperature <= 24 & relative_humidity > 80,
    "Yes", "No"
  ))
)
Q5DharD1$Fcond <- relevel(Q5DharD1$Fcond, ref = "Yes")

# Pooled-variance two-sample t-test for Belagavi. `var.equal` is written out in
# full rather than relying on partial matching of `var.eq`.
TestD1Bel <- t.test(
  d1 ~ Fcond,
  data = Q5BelD1,
  alternative = "greater",
  var.equal = TRUE
)
TestD1Bel
##
## Two Sample t-test
##
## data: d1 by Fcond
## t = 2.7605, df = 22, p-value = 0.005707
## alternative hypothesis: true difference in means between group Yes and group No is greater than 0
## 95 percent confidence interval:
## 9.704442 Inf
## sample estimates:
## mean in group Yes mean in group No
## 37.59305 11.91669
# One-sided pooled-variance t-test of D1 by favourable-condition flag for
# Dharwad. `var.equal` is written out in full rather than relying on partial
# matching of `var.eq`.
TestD1Dhar <- t.test(
  d1 ~ Fcond,
  data = Q5DharD1,
  alternative = "greater",
  var.equal = TRUE
)
TestD1Dhar
##
## Two Sample t-test
##
## data: d1 by Fcond
## t = 4.5934, df = 20, p-value = 8.801e-05
## alternative hypothesis: true difference in means between group Yes and group No is greater than 0
## 95 percent confidence interval:
## 15.66022 Inf
## sample estimates:
## mean in group Yes mean in group No
## 31.590651 6.515126
# Disease D2: a row is flagged favourable ("Yes") when temperature lies in
# [21.5, 24.5] and relative humidity is above 83; every other row is "No".
# "Yes" is made the first level so the one-sided tests evaluate
# mean(Yes) - mean(No) > 0.
Q5BelD2 <- Belagavi
Q5BelD2$Fcond <- with(
  Q5BelD2,
  factor(ifelse(
    temperature >= 21.5 & temperature <= 24.5 & relative_humidity > 83,
    "Yes", "No"
  ))
)
Q5BelD2$Fcond <- relevel(Q5BelD2$Fcond, ref = "Yes")

Q5DharD2 <- Dharwad
Q5DharD2$Fcond <- with(
  Q5DharD2,
  factor(ifelse(
    temperature >= 21.5 & temperature <= 24.5 & relative_humidity > 83,
    "Yes", "No"
  ))
)
Q5DharD2$Fcond <- relevel(Q5DharD2$Fcond, ref = "Yes")

# Pooled-variance two-sample t-test for Belagavi. `var.equal` is written out in
# full rather than relying on partial matching of `var.eq`.
TestD2Bel <- t.test(
  d2 ~ Fcond,
  data = Q5BelD2,
  alternative = "greater",
  var.equal = TRUE
)
TestD2Bel
##
## Two Sample t-test
##
## data: d2 by Fcond
## t = 3.7247, df = 22, p-value = 0.0005887
## alternative hypothesis: true difference in means between group Yes and group No is greater than 0
## 95 percent confidence interval:
## 10.89113 Inf
## sample estimates:
## mean in group Yes mean in group No
## 29.380223 9.173547
# One-sided pooled-variance t-test of D2 by favourable-condition flag for
# Dharwad. `var.equal` is written out in full rather than relying on partial
# matching of `var.eq`.
TestD2Dhar <- t.test(
  d2 ~ Fcond,
  data = Q5DharD2,
  alternative = "greater",
  var.equal = TRUE
)
TestD2Dhar
##
## Two Sample t-test
##
## data: d2 by Fcond
## t = 4.0726, df = 20, p-value = 0.0002968
## alternative hypothesis: true difference in means between group Yes and group No is greater than 0
## 95 percent confidence interval:
## 19.62338 Inf
## sample estimates:
## mean in group Yes mean in group No
## 40.134921 6.096486
# Disease D3: a row is flagged favourable ("Yes") when temperature lies in
# [22, 24]; every other row is "No". Humidity is not part of this rule.
# "Yes" is made the first level so the one-sided tests evaluate
# mean(Yes) - mean(No) > 0.
Q5BelD3 <- Belagavi
Q5BelD3$Fcond <- with(
  Q5BelD3,
  factor(ifelse(temperature >= 22 & temperature <= 24, "Yes", "No"))
)
Q5BelD3$Fcond <- relevel(Q5BelD3$Fcond, ref = "Yes")

Q5DharD3 <- Dharwad
Q5DharD3$Fcond <- with(
  Q5DharD3,
  factor(ifelse(temperature >= 22 & temperature <= 24, "Yes", "No"))
)
Q5DharD3$Fcond <- relevel(Q5DharD3$Fcond, ref = "Yes")

# Pooled-variance two-sample t-test for Belagavi. `var.equal` is written out in
# full rather than relying on partial matching of `var.eq`.
TestD3Bel <- t.test(
  d3 ~ Fcond,
  data = Q5BelD3,
  alternative = "greater",
  var.equal = TRUE
)
TestD3Bel
##
## Two Sample t-test
##
## data: d3 by Fcond
## t = 2.2224, df = 22, p-value = 0.01843
## alternative hypothesis: true difference in means between group Yes and group No is greater than 0
## 95 percent confidence interval:
## 4.39784 Inf
## sample estimates:
## mean in group Yes mean in group No
## 30.95773 11.61233
# One-sided pooled-variance t-test of D3 by favourable-condition flag for
# Dharwad. `var.equal` is written out in full rather than relying on partial
# matching of `var.eq`.
TestD3Dhar <- t.test(
  d3 ~ Fcond,
  data = Q5DharD3,
  alternative = "greater",
  var.equal = TRUE
)
TestD3Dhar
##
## Two Sample t-test
##
## data: d3 by Fcond
## t = 1.5057, df = 20, p-value = 0.07389
## alternative hypothesis: true difference in means between group Yes and group No is greater than 0
## 95 percent confidence interval:
## -4.118138 Inf
## sample estimates:
## mean in group Yes mean in group No
## 40.26971 11.96166
# Disease D4: a row is flagged favourable ("Yes") when temperature lies in
# [22, 26] and relative humidity is above 85; every other row is "No".
# "Yes" is made the first level so the one-sided tests evaluate
# mean(Yes) - mean(No) > 0.
Q5BelD4 <- Belagavi
Q5BelD4$Fcond <- with(
  Q5BelD4,
  factor(ifelse(
    temperature >= 22 & temperature <= 26 & relative_humidity > 85,
    "Yes", "No"
  ))
)
Q5BelD4$Fcond <- relevel(Q5BelD4$Fcond, ref = "Yes")

Q5DharD4 <- Dharwad
Q5DharD4$Fcond <- with(
  Q5DharD4,
  factor(ifelse(
    temperature >= 22 & temperature <= 26 & relative_humidity > 85,
    "Yes", "No"
  ))
)
Q5DharD4$Fcond <- relevel(Q5DharD4$Fcond, ref = "Yes")

# Pooled-variance two-sample t-test for Belagavi. `var.equal` is written out in
# full rather than relying on partial matching of `var.eq`.
TestD4Bel <- t.test(
  d4 ~ Fcond,
  data = Q5BelD4,
  alternative = "greater",
  var.equal = TRUE
)
TestD4Bel
##
## Two Sample t-test
##
## data: d4 by Fcond
## t = 1.793, df = 22, p-value = 0.04337
## alternative hypothesis: true difference in means between group Yes and group No is greater than 0
## 95 percent confidence interval:
## 0.4785112 Inf
## sample estimates:
## mean in group Yes mean in group No
## 24.28984 12.97384
# One-sided pooled-variance t-test of D4 by favourable-condition flag for
# Dharwad. `var.equal` is written out in full rather than relying on partial
# matching of `var.eq`.
TestD4Dhar <- t.test(
  d4 ~ Fcond,
  data = Q5DharD4,
  alternative = "greater",
  var.equal = TRUE
)
TestD4Dhar
##
## Two Sample t-test
##
## data: d4 by Fcond
## t = 2.3147, df = 20, p-value = 0.01569
## alternative hypothesis: true difference in means between group Yes and group No is greater than 0
## 95 percent confidence interval:
## 6.896259 Inf
## sample estimates:
## mean in group Yes mean in group No
## 39.16667 12.10875
# Disease D5: a row is flagged favourable ("Yes") when temperature lies in
# [22, 24.5] and relative humidity is strictly between 77 and 85; every other
# row is "No".
# "Yes" is made the first level so the one-sided tests evaluate
# mean(Yes) - mean(No) > 0.
Q5BelD5 <- Belagavi
Q5BelD5$Fcond <- with(
  Q5BelD5,
  factor(ifelse(
    temperature >= 22 & temperature <= 24.5 &
      relative_humidity > 77 & relative_humidity < 85,
    "Yes", "No"
  ))
)
Q5BelD5$Fcond <- relevel(Q5BelD5$Fcond, ref = "Yes")

Q5DharD5 <- Dharwad
Q5DharD5$Fcond <- with(
  Q5DharD5,
  factor(ifelse(
    temperature >= 22 & temperature <= 24.5 &
      relative_humidity > 77 & relative_humidity < 85,
    "Yes", "No"
  ))
)
Q5DharD5$Fcond <- relevel(Q5DharD5$Fcond, ref = "Yes")

# Pooled-variance two-sample t-test for Belagavi. `var.equal` is written out in
# full rather than relying on partial matching of `var.eq`.
TestD5Bel <- t.test(
  d5 ~ Fcond,
  data = Q5BelD5,
  alternative = "greater",
  var.equal = TRUE
)
TestD5Bel
##
## Two Sample t-test
##
## data: d5 by Fcond
## t = 3.6675, df = 22, p-value = 0.0006761
## alternative hypothesis: true difference in means between group Yes and group No is greater than 0
## 95 percent confidence interval:
## 13.85781 Inf
## sample estimates:
## mean in group Yes mean in group No
## 36.57407 10.51547
# One-sided pooled-variance t-test of D5 by favourable-condition flag for
# Dharwad. `var.equal` is written out in full rather than relying on partial
# matching of `var.eq`.
TestD5Dhar <- t.test(
  d5 ~ Fcond,
  data = Q5DharD5,
  alternative = "greater",
  var.equal = TRUE
)
TestD5Dhar
##
## Two Sample t-test
##
## data: d5 by Fcond
## t = 0.10853, df = 20, p-value = 0.4573
## alternative hypothesis: true difference in means between group Yes and group No is greater than 0
## 95 percent confidence interval:
## -16.53381 Inf
## sample estimates:
## mean in group Yes mean in group No
## 14.17749 13.06725
# Disease D7: a row is flagged favourable ("Yes") when temperature is above 25
# and relative humidity is above 80; every other row is "No".
# "Yes" is made the first level so the one-sided tests evaluate
# mean(Yes) - mean(No) > 0.
Q5BelD7 <- Belagavi
Q5BelD7$Fcond <- with(
  Q5BelD7,
  factor(ifelse(temperature > 25 & relative_humidity > 80, "Yes", "No"))
)
Q5BelD7$Fcond <- relevel(Q5BelD7$Fcond, ref = "Yes")

Q5DharD7 <- Dharwad
Q5DharD7$Fcond <- with(
  Q5DharD7,
  factor(ifelse(temperature > 25 & relative_humidity > 80, "Yes", "No"))
)
Q5DharD7$Fcond <- relevel(Q5DharD7$Fcond, ref = "Yes")

# Pooled-variance two-sample t-test for Belagavi. `var.equal` is written out in
# full rather than relying on partial matching of `var.eq`.
TestD7Bel <- t.test(
  d7 ~ Fcond,
  data = Q5BelD7,
  alternative = "greater",
  var.equal = TRUE
)
TestD7Bel
##
## Two Sample t-test
##
## data: d7 by Fcond
## t = 3.4275, df = 22, p-value = 0.001204
## alternative hypothesis: true difference in means between group Yes and group No is greater than 0
## 95 percent confidence interval:
## 25.65723 Inf
## sample estimates:
## mean in group Yes mean in group No
## 72.42328 21.00642
# One-sided pooled-variance t-test of D7 by favourable-condition flag for
# Dharwad. `var.equal` is written out in full rather than relying on partial
# matching of `var.eq`.
TestD7Dhar <- t.test(
  d7 ~ Fcond,
  data = Q5DharD7,
  alternative = "greater",
  var.equal = TRUE
)
TestD7Dhar
##
## Two Sample t-test
##
## data: d7 by Fcond
## t = 0.72663, df = 20, p-value = 0.2379
## alternative hypothesis: true difference in means between group Yes and group No is greater than 0
## 95 percent confidence interval:
## -20.73009 Inf
## sample estimates:
## mean in group Yes mean in group No
## 35.00000 19.90822