Data pre-processing

# Pre-processing of data1: standardise column names, parse the date column,
# build aggregate access counts and derive calendar helper columns.
data1 <- data1 %>%
  rename(month.year = `Month-Year`, user.number = `No of users`)
data1$month.year <- as.Date(data1$month.year, format = "%Y-%m-%d")

# Row-wise totals over the disease (D1..D11) and variety (V1..V10) columns.
# with() evaluates the sum inside data1, so the columns resolve without
# depending on an earlier attach(data1) having put them on the search path.
data1$appusersfordiseaseinfo <- with(
  data1,
  D1 + D2 + D3 + D4 + D5 + D6 + D7 + D8 + D9 + D10 + D11
)
data1$appusersforvarietiesinfo <- with(
  data1,
  V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10
)

# Numeric year and month extracted from the parsed date.
data1$year <- as.numeric(format(data1$month.year, "%Y"))
data1$month <- as.numeric(format(data1$month.year, "%m"))

# Look up the English month name from the built-in month.name constant.
month.number <- c(data1$month)
data1$month.name <- month.name[month.number]

# Store the calendar columns as factors.
data1$year <- as.factor(data1$year)
data1$month <- as.factor(data1$month)
data1$month.name <- as.factor(data1$month.name)

Q1

Anand, the cofounder of JAT, claims that disease 6 (leaf curl) information was accessed at least 60 times every week on average since October 2017 due to this disease outbreak. Test this claim at a significance level of 0.05 using an appropriate hypothesis test.

# Ho: D6 <= 60, Ha : D6 > 60

# Keep the observations dated 2017-10-01 or later, then retain only the
# date, Usage and D6 columns.
data2 <- data1 %>%
  filter(month.year >= "2017-10-01") %>%
  select(month.year, Usage, D6)

# One-sample, upper-tailed t-test of the mean of D6 against 60.
t.test(data2$D6, mu = 60, alternative = "greater")
## 
##  One Sample t-test
## 
## data:  data2$D6
## t = 2.341, df = 28, p-value = 0.01329
## alternative hypothesis: true mean is greater than 60
## 95 percent confidence interval:
##  62.29976      Inf
## sample estimates:
## mean of x 
##  68.41379
#Since p-value (0.01329) < 0.05, we reject Ho in favour of Ha: on average, D6 information was accessed more than 60 times every week since October 2017, which supports the claim of at least 60 accesses per week.

Q2

Q3

JAT believes that over the years, the average number of app users has increased significantly. Is there statistical evidence to support that the average number of users in 2017-2018 is greater than the average number of users in 2015-2016 at α = 0.05? Support your answer with all necessary tests.

# Keep only the user counts and the year factor for the period comparison.
data4 <- data1 %>% select(user.number, year)
# Ho : avg users of 2017-2018 <= 2015-2016,
# Ha : avg users of 2017-2018 > 2015-2016

# x = later period, y = earlier period, so Ha (x > y) is the upper-tailed
# alternative. The default t.test method has no `data` argument, so it is
# not passed here.
# NOTE: the output recorded below came from the earlier two-sided call and
# should be refreshed by re-knitting; since t > 0, the one-sided p-value is
# half of the two-sided value shown.
t.test(
  x = data4$user.number[data4$year %in% c("2017", "2018")],
  y = data4$user.number[data4$year %in% c("2015", "2016")],
  alternative = "greater",
  var.equal = TRUE
)
## 
##  Two Sample t-test
## 
## data:  data4$user.number[data4$year == "2017" | data4$year == "2018"] and data4$user.number[data4$year == "2015" | data4$year == "2016"]
## t = 9.2567, df = 121, p-value = 0.0000000000000009507
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  103.0074 159.0556
## sample estimates:
## mean of x mean of y 
## 181.10000  50.06849
#OR

# Label every row with its two-year period. Years matched by neither branch
# are left as NA by case_when().
data4.1 <- data4 %>%
  mutate(
    year.range = case_when(
      year %in% c("2017", "2018") ~ "2017-2018",
      year %in% c("2015", "2016") ~ "2015-2016"
    )
  )
# case_when() with %in% is used here because the new column is derived from a
# categorical column.
# The formula interface orders the groups alphabetically ("2015-2016" first),
# and the tested difference is first minus second. Ha (2017-2018 > 2015-2016)
# therefore corresponds to alternative = "less".
# NOTE: the output recorded below came from the earlier two-sided call and
# should be refreshed by re-knitting.
t.test(
  data4.1$user.number ~ data4.1$year.range,
  alternative = "less",
  var.equal = TRUE
)
## 
##  Two Sample t-test
## 
## data:  data4.1$user.number by data4.1$year.range
## t = -9.2567, df = 121, p-value = 0.0000000000000009507
## alternative hypothesis: true difference in means between group 2015-2016 and group 2017-2018 is not equal to 0
## 95 percent confidence interval:
##  -159.0556 -103.0074
## sample estimates:
## mean in group 2015-2016 mean in group 2017-2018 
##                50.06849               181.10000
#OR

# ifelse() categorises each row: 0 for dates up to 2016-12-01, 1 afterwards.
data1$group <- factor(ifelse(data1$month.year <= "2016-12-01", 0, 1))
# Group 0 (earlier period) is the first factor level and the tested difference
# is group 0 minus group 1, so Ha (later period > earlier period) corresponds
# to alternative = "less".
# NOTE: the output recorded below came from the earlier two-sided call and
# should be refreshed by re-knitting.
t.test(data1$user.number ~ data1$group, alternative = "less", var.equal = TRUE)
## 
##  Two Sample t-test
## 
## data:  data1$user.number by data1$group
## t = -9.2567, df = 121, p-value = 0.0000000000000009507
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -159.0556 -103.0074
## sample estimates:
## mean in group 0 mean in group 1 
##        50.06849       181.10000
#Since p value is less than 0.05, we reject null and go with alternate hypothesis. 
#average number of users in year 2017-2018 is more than the average number of users in year 2015-2016

Q4

Farmers use apps to access information throughout the month. Using the data, check whether app usage is same or different across the four weeks of a month. Anand claims that app usage picked up after January 2016; so, test this hypothesis using data from January-2016 - May 2018.

# Restrict to 2016-01-01 .. 2018-05-01 and keep the date, Week and Usage columns.
data5 <- data1 %>%
  filter(month.year >= "2016-01-01", month.year <= "2018-05-01") %>%
  select(month.year, Week, Usage)

# Ho : app usage is same across 4 weeks,
# Ha: app usage is different across the 4 weeks

# One-way ANOVA of Usage on Week; summary() prints the F-test table.
anova <- aov(Usage ~ Week, data = data5)
summary(anova)
##             Df   Sum Sq Mean Sq F value Pr(>F)  
## Week         3  1675404  558468   2.319 0.0804 .
## Residuals   94 22633380  240781                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Print the grand mean and the per-Week mean Usage from the fitted ANOVA.
anova %>% model.tables(type = "means")
## Tables of means
## Grand mean
##          
## 657.7041 
## 
##  Week 
##     Week1 Week2 Week3 Week4
##     622.2 583.8 541.5 875.8
## rep  25.0  24.0  24.0  25.0
# Since the p-value (0.0804) is greater than 0.05, we fail to reject the null hypothesis. There is no statistically significant evidence that app usage differs across the 4 weeks.

Q5

A new version of the app was released in August 2016. Anand wants to understand which month in the given time frame after the launch of the new version, the mean usage pattern would start to show a statistically significant shift.

#first we plot a line graph to see from which month is the shift happening.

# Observations from the new-version launch month (2016-08-01) onwards.
data6 <- data1 %>%
  select(month.year, year, month.name, Usage) %>%
  filter(month.year >= "2016-08-01")

# Line plot of Usage over time. month.year is a Date, so a date scale is used
# with one tick per distinct date, labelled as YYYY-MM-DD. This replaces a
# continuous scale with one (possibly repeated) break per row.
ggplot(data6, aes(x = month.year, y = Usage)) +
  geom_line() +
  labs(x = "Year", y = "Usage") +
  scale_x_date(breaks = unique(data6$month.year), date_labels = "%Y-%m-%d") +
  theme(axis.text.x = element_text(angle = 90))

#we can see that from 2016-10-01 there is a shift in usage. We do a test to check whether the shift is significant or not.

#Ho: there is no significant shift in the mean usage pattern across months after the new version is launched
#Ha : there is significant shift in the mean usage pattern across months after the new version is launched


# Flag each row as "before" or "after" the candidate shift date 2016-10-01
# (stored as a factor).
data6$month.group <- factor(ifelse(data6$month.year >= "2016-10-01", "after", "before"))

#OR

# Same grouping built with case_when(); this overwrites the column above with
# a character vector.
data6 <- data6 %>%
  mutate(
    month.group = case_when(
      month.year >= "2016-10-01" ~ "after",
      month.year < "2016-10-01" ~ "before"
    )
  )
# Groups are ordered alphabetically ("after" first), so alternative = "greater"
# tests whether mean Usage after the date exceeds mean Usage before it.
t.test(data6$Usage ~ data6$month.group, var.equal = TRUE, alternative = "greater")
## 
##  Two Sample t-test
## 
## data:  data6$Usage by data6$month.group
## t = 1.9209, df = 68, p-value = 0.02947
## alternative hypothesis: true difference in means between group after and group before is greater than 0
## 95 percent confidence interval:
##  50.62324      Inf
## sample estimates:
##  mean in group after mean in group before 
##             814.4194             430.6250
#Since P value < 0.05, we reject null hypothesis. 
#there is significant shift in the mean usage pattern from 2016-10-01 onwards.

Q6

If a disease is likely to spread in particular weather condition (data given in the disease index sheet), then the access of that disease should be more in the months having suitable weather conditions. Help the analyst in coming up with a statistical test to support the claim for two districts for which the sample of weather and disease access data is provided in the data sheet. Identify the diseases for which you can support this claim. Test this claim both for temperature and relative humidity at 95% confidence.

# import Belagavi weather data

# Read the Belagavi weather sheet and give the humidity column a syntactic name.
bel_weather <- read_excel("JAT.xlsx", sheet = "Belagavi_weather") %>%
  rename(relative.humidity = `Relative Humidity`)

#Calculation for D1
# Belagavi, D1: keep the access counts and the two weather variables.
data8.1 <- bel_weather %>% select(D1, `relative.humidity`, Temperature)
#Ho : access of D1 <= in the months having suitable weather conditions
#Ha : access of D1 > in the months having suitable weather conditions

# Classify each row: "Favourable" when humidity > 80 and temperature is within
# 20-24, otherwise "Unfavourable" (NA when the condition cannot be evaluated).
data9.1 <- data8.1 %>%
  mutate(
    D1.probability = if_else(
      relative.humidity > 80 & Temperature >= 20 & Temperature <= 24,
      "Favourable",
      "Unfavourable"
    )
  )

# Test whether mean D1 access is higher in the "Favourable" group. Groups are
# ordered alphabetically ("Favourable" first), hence alternative = "greater".
t.test(data9.1$D1 ~ data9.1$D1.probability, var.equal = TRUE, alternative = "greater")
## 
##  Two Sample t-test
## 
## data:  data9.1$D1 by data9.1$D1.probability
## t = 2.7605, df = 22, p-value = 0.005707
## alternative hypothesis: true difference in means between group Favourable and group Unfavourable is greater than 0
## 95 percent confidence interval:
##  9.704442      Inf
## sample estimates:
##   mean in group Favourable mean in group Unfavourable 
##                   37.59305                   11.91669
#Since, p value is < 0.05 (0.005707), we reject null hypothesis. So, access of D1 > in the months having suitable weather conditions

#Calculation for D2
# Belagavi, D2: keep the access counts and the two weather variables.
data8.2 <- bel_weather %>% select(D2, `relative.humidity`, Temperature)
# "High" when humidity > 83 and temperature is within 21.5-24.5, else "Low".
data9.2 <- data8.2 %>%
  mutate(
    D2.probability = if_else(
      relative.humidity > 83 & Temperature >= 21.5 & Temperature <= 24.5,
      "High",
      "Low"
    )
  )

#Ho : access of D2 <= in the months having suitable weather conditions
#Ha : access of D2 > in the months having suitable weather conditions

# "High" sorts before "Low", so "greater" tests High minus Low > 0.
t.test(data9.2$D2 ~ data9.2$D2.probability, var.equal = TRUE, alternative = "greater")
## 
##  Two Sample t-test
## 
## data:  data9.2$D2 by data9.2$D2.probability
## t = 3.7247, df = 22, p-value = 0.0005887
## alternative hypothesis: true difference in means between group High and group Low is greater than 0
## 95 percent confidence interval:
##  10.89113      Inf
## sample estimates:
## mean in group High  mean in group Low 
##          29.380223           9.173547
#Since, p value is < 0.05 (0.0005887), we reject null hypothesis. So, access of D2 > in the months having suitable weather conditions

#Calculation for D3

# Belagavi, D3: keep the access counts and the weather variables.
data8.3 <- bel_weather %>% select(D3, `relative.humidity`, Temperature)
# "High" when temperature is within 22-24, else "Low" (humidity is not used).
data9.3 <- data8.3 %>%
  mutate(
    D3.probability = if_else(
      Temperature >= 22 & Temperature <= 24,
      "High",
      "Low"
    )
  )

#Ho : access of D3 <= in the months having suitable weather conditions
#Ha : access of D3 > in the months having suitable weather conditions

# "High" sorts before "Low", so "greater" tests High minus Low > 0.
t.test(data9.3$D3 ~ data9.3$D3.probability, var.equal = TRUE, alternative = "greater")
## 
##  Two Sample t-test
## 
## data:  data9.3$D3 by data9.3$D3.probability
## t = 2.2224, df = 22, p-value = 0.01843
## alternative hypothesis: true difference in means between group High and group Low is greater than 0
## 95 percent confidence interval:
##  4.39784     Inf
## sample estimates:
## mean in group High  mean in group Low 
##           30.95773           11.61233
#Since, p value is < 0.05 (0.01843), we reject null hypothesis. So, access of D3 > in the months having suitable weather conditions

# Calculation for D4

# Belagavi, D4: keep the access counts and the two weather variables.
data8.4 <- bel_weather %>% select(D4, `relative.humidity`, Temperature)
# "High" when humidity > 85 and temperature is within 22-26, else "Low".
data9.4 <- data8.4 %>%
  mutate(
    D4.probability = if_else(
      relative.humidity > 85 & Temperature >= 22 & Temperature <= 26,
      "High",
      "Low"
    )
  )

#Ho : access of D4 <= in the months having suitable weather conditions
#Ha : access of D4 > in the months having suitable weather conditions

# "High" sorts before "Low", so "greater" tests High minus Low > 0.
t.test(data9.4$D4 ~ data9.4$D4.probability, var.equal = TRUE, alternative = "greater")
## 
##  Two Sample t-test
## 
## data:  data9.4$D4 by data9.4$D4.probability
## t = 1.793, df = 22, p-value = 0.04337
## alternative hypothesis: true difference in means between group High and group Low is greater than 0
## 95 percent confidence interval:
##  0.4785112       Inf
## sample estimates:
## mean in group High  mean in group Low 
##           24.28984           12.97384
#Since, p value is < 0.05 (0.04337), we reject null hypothesis. So, access of D4 > in the months having suitable weather conditions


#Calculation for D5

# Belagavi, D5: keep the access counts and the two weather variables.
data8.5 <- bel_weather %>% select(D5, `relative.humidity`, Temperature)
# "High" when humidity is within 77-85 and temperature is within 22-24.5,
# else "Low".
data9.5 <- data8.5 %>%
  mutate(
    D5.probability = if_else(
      relative.humidity >= 77 & relative.humidity <= 85 &
        Temperature >= 22 & Temperature <= 24.5,
      "High",
      "Low"
    )
  )

#Ho : access of D5 <= in the months having suitable weather conditions
#Ha : access of D5 > in the months having suitable weather conditions

# "High" sorts before "Low", so "greater" tests High minus Low > 0.
t.test(data9.5$D5 ~ data9.5$D5.probability, var.equal = TRUE, alternative = "greater")
## 
##  Two Sample t-test
## 
## data:  data9.5$D5 by data9.5$D5.probability
## t = 3.6675, df = 22, p-value = 0.0006761
## alternative hypothesis: true difference in means between group High and group Low is greater than 0
## 95 percent confidence interval:
##  13.85781      Inf
## sample estimates:
## mean in group High  mean in group Low 
##           36.57407           10.51547
#Since, p value is < 0.05 (0.0006761), we reject null hypothesis. So, access of D5 > in the months having suitable weather conditions

# Calculation for D7

# Belagavi, D7: keep the access counts and the two weather variables.
data8.7 <- bel_weather %>% select(D7, `relative.humidity`, Temperature)
# "High" when humidity > 80 and temperature > 25, else "Low".
data9.7 <- data8.7 %>%
  mutate(
    D7.probability = if_else(
      relative.humidity > 80 & Temperature > 25,
      "High",
      "Low"
    )
  )

#Ho : access of D7 <= in the months having suitable weather conditions
#Ha : access of D7 > in the months having suitable weather conditions

# "High" sorts before "Low", so "greater" tests High minus Low > 0.
t.test(data9.7$D7 ~ data9.7$D7.probability, var.equal = TRUE, alternative = "greater")
## 
##  Two Sample t-test
## 
## data:  data9.7$D7 by data9.7$D7.probability
## t = 3.4275, df = 22, p-value = 0.001204
## alternative hypothesis: true difference in means between group High and group Low is greater than 0
## 95 percent confidence interval:
##  25.65723      Inf
## sample estimates:
## mean in group High  mean in group Low 
##           72.42328           21.00642
#Since, p value is < 0.05 (0.001204), we reject null hypothesis. So, access of D7 > in the months having suitable weather conditions


# For Dharwad weather
# import Dharwad weather data

# Read the Dharwad weather sheet (column names are kept as they are in the file).
dhar_weather <- read_excel("JAT.xlsx", sheet = "Dharwad_weather")

#Calculation for D1

# Dharwad, D1: keep the access counts and the two weather variables.
data10.1 <- dhar_weather %>% select(D1, Temperature, `Relative Humidity`)
# "High" when humidity > 80 and temperature is within 20-24, else "Low".
data11.1 <- data10.1 %>%
  mutate(
    D1.probability = if_else(
      `Relative Humidity` > 80 & Temperature >= 20 & Temperature <= 24,
      "High",
      "Low"
    )
  )
# "High" sorts before "Low", so "greater" tests High minus Low > 0.
t.test(data11.1$D1 ~ data11.1$D1.probability, var.equal = TRUE, alternative = "greater")
## 
##  Two Sample t-test
## 
## data:  data11.1$D1 by data11.1$D1.probability
## t = 4.5934, df = 20, p-value = 0.00008801
## alternative hypothesis: true difference in means between group High and group Low is greater than 0
## 95 percent confidence interval:
##  15.66022      Inf
## sample estimates:
## mean in group High  mean in group Low 
##          31.590651           6.515126
#Since, p value is < 0.05 (0.00008801), we reject null hypothesis. So, access of D1 > in the months having suitable weather conditions

#Calculation for D2
# Dharwad, D2: keep the access counts and the two weather variables.
data10.2 <- dhar_weather %>% select(D2, Temperature, `Relative Humidity`)
# "High" when humidity > 83 and temperature is within 21.5-24.5, else "Low".
data11.2 <- data10.2 %>%
  mutate(
    D2.probability = if_else(
      `Relative Humidity` > 83 & Temperature >= 21.5 & Temperature <= 24.5,
      "High",
      "Low"
    )
  )

#Ho : access of D2 <= in the months having suitable weather conditions
#Ha : access of D2 > in the months having suitable weather conditions

# "High" sorts before "Low", so "greater" tests High minus Low > 0.
t.test(data11.2$D2 ~ data11.2$D2.probability, var.equal = TRUE, alternative = "greater")
## 
##  Two Sample t-test
## 
## data:  data11.2$D2 by data11.2$D2.probability
## t = 4.0726, df = 20, p-value = 0.0002968
## alternative hypothesis: true difference in means between group High and group Low is greater than 0
## 95 percent confidence interval:
##  19.62338      Inf
## sample estimates:
## mean in group High  mean in group Low 
##          40.134921           6.096486
#Since, p value is < 0.05 (0.0002968), we reject null hypothesis. So, access of D2 > in the months having suitable weather conditions


#Calculation for D3

# Dharwad, D3: keep the access counts and the weather variables.
data10.3 <- dhar_weather %>% select(D3, `Relative Humidity`, Temperature)
# "High" when temperature is within 22-24, else "Low" (humidity is not used).
data11.3 <- data10.3 %>%
  mutate(
    D3.probability = if_else(
      Temperature >= 22 & Temperature <= 24,
      "High",
      "Low"
    )
  )

#Ho : access of D3 <= in the months having suitable weather conditions
#Ha : access of D3 > in the months having suitable weather conditions

# "High" sorts before "Low", so "greater" tests High minus Low > 0.
t.test(data11.3$D3 ~ data11.3$D3.probability, var.equal = TRUE, alternative = "greater")
## 
##  Two Sample t-test
## 
## data:  data11.3$D3 by data11.3$D3.probability
## t = 1.5057, df = 20, p-value = 0.07389
## alternative hypothesis: true difference in means between group High and group Low is greater than 0
## 95 percent confidence interval:
##  -4.118138       Inf
## sample estimates:
## mean in group High  mean in group Low 
##           40.26971           11.96166
#Since, p value is > 0.05 (0.07389), we cannot reject null hypothesis. So, test is insignificant.


# Calculation for D4

# Dharwad, D4: keep the access counts and the two weather variables.
data10.4 <- dhar_weather %>% select(D4, `Relative Humidity`, Temperature)
# "High" when humidity > 85 and temperature is within 22-26, else "Low".
data11.4 <- data10.4 %>%
  mutate(
    D4.probability = if_else(
      `Relative Humidity` > 85 & Temperature >= 22 & Temperature <= 26,
      "High",
      "Low"
    )
  )

#Ho : access of D4 <= in the months having suitable weather conditions
#Ha : access of D4 > in the months having suitable weather conditions

# "High" sorts before "Low", so "greater" tests High minus Low > 0.
t.test(data11.4$D4 ~ data11.4$D4.probability, var.equal = TRUE, alternative = "greater")
## 
##  Two Sample t-test
## 
## data:  data11.4$D4 by data11.4$D4.probability
## t = 2.3147, df = 20, p-value = 0.01569
## alternative hypothesis: true difference in means between group High and group Low is greater than 0
## 95 percent confidence interval:
##  6.896259      Inf
## sample estimates:
## mean in group High  mean in group Low 
##           39.16667           12.10875
#Since, p value is < 0.05 (0.01569), we reject null hypothesis. So, access of D4 > in the months having suitable weather conditions


#Calculation for D5

# Dharwad, D5: keep the access counts and the two weather variables.
data10.5 <- dhar_weather %>% select(D5, `Relative Humidity`, Temperature)
# "High" when humidity is within 77-85 and temperature is within 22-24.5,
# else "Low".
data11.5 <- data10.5 %>%
  mutate(
    D5.probability = if_else(
      `Relative Humidity` >= 77 & `Relative Humidity` <= 85 &
        Temperature >= 22 & Temperature <= 24.5,
      "High",
      "Low"
    )
  )

#Ho : access of D5 <= in the months having suitable weather conditions
#Ha : access of D5 > in the months having suitable weather conditions

# "High" sorts before "Low", so "greater" tests High minus Low > 0.
t.test(data11.5$D5 ~ data11.5$D5.probability, var.equal = TRUE, alternative = "greater")
## 
##  Two Sample t-test
## 
## data:  data11.5$D5 by data11.5$D5.probability
## t = 0.10853, df = 20, p-value = 0.4573
## alternative hypothesis: true difference in means between group High and group Low is greater than 0
## 95 percent confidence interval:
##  -16.53381       Inf
## sample estimates:
## mean in group High  mean in group Low 
##           14.17749           13.06725
#Since, p value is > 0.05 (0.4573), we cannot reject null hypothesis. So, test is insignificant


# Calculation for D7

# Dharwad, D7: keep the access counts and the two weather variables.
data10.7 <- dhar_weather %>% select(D7, `Relative Humidity`, Temperature)
# "High" when humidity > 80 and temperature > 25, else "Low".
data11.7 <- data10.7 %>%
  mutate(
    D7.probability = if_else(
      `Relative Humidity` > 80 & Temperature > 25,
      "High",
      "Low"
    )
  )

#Ho : access of D7 <= in the months having suitable weather conditions
#Ha : access of D7 > in the months having suitable weather conditions

# "High" sorts before "Low", so "greater" tests High minus Low > 0.
t.test(data11.7$D7 ~ data11.7$D7.probability, var.equal = TRUE, alternative = "greater")
## 
##  Two Sample t-test
## 
## data:  data11.7$D7 by data11.7$D7.probability
## t = 0.72663, df = 20, p-value = 0.2379
## alternative hypothesis: true difference in means between group High and group Low is greater than 0
## 95 percent confidence interval:
##  -20.73009       Inf
## sample estimates:
## mean in group High  mean in group Low 
##           35.00000           19.90822
#Since, p value is > 0.05 (0.2379), we cannot reject null hypothesis. So, test is insignificant.