setwd("C:/_MyData_/IIMK/Assignment 1")
library (readxl)
library (dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read the weekly usage records from the "Data Sheet" tab of the case workbook.
agriData_org <- read_excel(
  path = "IMB733-XLS-ENG Spreadsheet 3.xlsx",
  sheet = "Data Sheet"
)
# Work on a copy so the freshly loaded data stays available in agriData_org.
agriData <- agriData_org
# Print the column names, types and leading values of the working copy.
str(agriData)
## tibble [123 x 26] (S3: tbl_df/tbl/data.frame)
## $ Month-Year : POSIXct[1:123], format: "2015-06-01" "2015-07-01" ...
## $ Week : chr [1:123] "Week4" "Week1" "Week2" "Week3" ...
## $ No of users : num [1:123] 2 1 1 4 6 12 13 10 7 12 ...
## $ Usage : num [1:123] 4 1 25 70 100 291 225 141 148 215 ...
## $ D1 : num [1:123] 0 0 0 4 1 12 7 4 1 5 ...
## $ D2 : num [1:123] 0 0 1 2 1 6 5 4 0 3 ...
## $ D3 : num [1:123] 1 0 2 3 0 11 6 8 1 6 ...
## $ D4 : num [1:123] 0 0 2 4 2 4 2 4 5 3 ...
## $ D5 : num [1:123] 0 0 0 2 0 5 5 3 4 6 ...
## $ D6 : num [1:123] 0 0 0 4 2 15 6 5 5 12 ...
## $ D7 : num [1:123] 0 0 2 1 7 7 4 8 3 7 ...
## $ D8 : num [1:123] 0 0 3 7 9 7 5 7 3 33 ...
## $ D9 : num [1:123] 0 0 2 3 2 6 6 6 3 11 ...
## $ D10 : num [1:123] 0 0 1 0 1 10 1 4 2 3 ...
## $ D11 : num [1:123] 0 0 1 3 4 12 6 3 5 8 ...
## $ V1 : num [1:123] 0 0 0 5 11 26 28 18 18 20 ...
## $ V2 : num [1:123] 0 1 0 4 8 20 19 11 16 15 ...
## $ V3 : num [1:123] 0 0 1 2 5 12 13 8 7 10 ...
## $ V4 : num [1:123] 0 0 1 2 5 13 8 7 6 6 ...
## $ V5 : num [1:123] 2 0 1 1 9 16 9 4 15 6 ...
## $ V6 : num [1:123] 0 0 1 3 3 22 14 4 10 10 ...
## $ V7 : num [1:123] 0 0 0 3 7 21 13 5 4 9 ...
## $ V8 : num [1:123] 0 0 0 2 6 14 9 5 10 7 ...
## $ V9 : num [1:123] 0 0 0 4 7 16 20 10 9 8 ...
## $ V10 : num [1:123] 0 0 1 4 7 23 17 5 14 10 ...
## $ Micronutrient: num [1:123] 1 0 6 7 3 13 22 8 7 17 ...
# Convert the Month-Year column from POSIXct to Date and store the result.
# The previous call discarded its result, so the column was never converted,
# and it passed "%Y-%m-%d" positionally, which as.Date() interprets as `tz`
# for POSIXct input (the source of the "unknown timezone" warning).
# Storing plain dates also makes the later comparisons against "YYYY-MM-DD"
# strings independent of the session time zone.
agriData$`Month-Year` <- as.Date(agriData$`Month-Year`)
agriData$`Month-Year` # Display the converted dates
## [1] "2015-06-01" "2015-07-01" "2015-07-01" "2015-07-01" "2015-07-01"
## [6] "2015-08-01" "2015-08-01" "2015-08-01" "2015-08-01" "2015-09-01"
## [11] "2015-09-01" "2015-09-01" "2015-09-01" "2015-10-01" "2015-10-01"
## [16] "2015-10-01" "2015-10-01" "2015-11-01" "2015-11-01" "2015-11-01"
## [21] "2015-11-01" "2015-12-01" "2015-12-01" "2015-12-01" "2015-12-01"
## [26] "2016-01-01" "2016-01-01" "2016-01-01" "2016-01-01" "2016-02-01"
## [31] "2016-02-01" "2016-02-01" "2016-02-01" "2016-03-01" "2016-03-01"
## [36] "2016-03-01" "2016-03-01" "2016-04-01" "2016-04-01" "2016-04-01"
## [41] "2016-04-01" "2016-05-01" "2016-05-01" "2016-05-01" "2016-05-01"
## [46] "2016-06-01" "2016-06-01" "2016-06-01" "2016-06-01" "2016-07-01"
## [51] "2016-07-01" "2016-07-01" "2016-07-01" "2016-08-01" "2016-08-01"
## [56] "2016-08-01" "2016-08-01" "2016-09-01" "2016-09-01" "2016-09-01"
## [61] "2016-09-01" "2016-10-01" "2016-10-01" "2016-10-01" "2016-10-01"
## [66] "2016-11-01" "2016-11-01" "2016-11-01" "2016-11-01" "2016-12-01"
## [71] "2016-12-01" "2016-12-01" "2016-12-01" "2017-01-01" "2017-01-01"
## [76] "2017-01-01" "2017-01-01" "2017-02-01" "2017-02-01" "2017-02-01"
## [81] "2017-02-01" "2017-03-01" "2017-03-01" "2017-03-01" "2017-03-01"
## [86] "2017-04-01" "2017-04-01" "2017-04-01" "2017-04-01" "2017-05-01"
## [91] "2017-05-01" "2017-05-01" "2017-05-01" "2017-09-01" "2017-10-01"
## [96] "2017-10-01" "2017-10-01" "2017-10-01" "2017-11-01" "2017-11-01"
## [101] "2017-11-01" "2017-11-01" "2017-12-01" "2017-12-01" "2017-12-01"
## [106] "2017-12-01" "2018-01-01" "2018-01-01" "2018-01-01" "2018-01-01"
## [111] "2018-02-01" "2018-02-01" "2018-02-01" "2018-02-01" "2018-03-01"
## [116] "2018-03-01" "2018-03-01" "2018-03-01" "2018-04-01" "2018-04-01"
## [121] "2018-04-01" "2018-04-01" "2018-05-01"
# Case 1: one-sample, one-sided t-test on weekly D6 accesses since Oct 2017.
# H0: mean <= 60, Ha: mean > 60.
case1Data <- subset(agriData, agriData$`Month-Year` >= "2017-10-01") # Rows from Oct 2017 onward
# Fix: the argument was misspelled as `alernative`, so t.test() silently
# ignored it and ran its default two-sided test. Spelled correctly, the
# intended "greater" alternative is applied.
# NOTE(review): the echoed output below this block was recorded from the
# two-sided run; re-knit to refresh the p-value and confidence interval.
case1Test <- t.test(case1Data$D6, mu = 60, alternative = "greater")
case1Test # Display test result
##
## One Sample t-test
##
## data: case1Data$D6
## t = 2.341, df = 28, p-value = 0.02658
## alternative hypothesis: true mean is not equal to 60
## 95 percent confidence interval:
## 61.05162 75.77597
## sample estimates:
## mean of x
## 68.41379
# Case 2: one-sided proportion test of D6 accesses against all disease
# accesses (null proportion 0.15, alternative "greater").
case2Data <- agriData
# Per-row total over the eleven disease columns D1..D11, added left to right.
case2Data$totalDiseaseAccess <- Reduce(`+`, case2Data[paste0("D", 1:11)])
case2Test <- prop.test(
  x = sum(case2Data$D6),
  n = sum(case2Data$totalDiseaseAccess),
  p = 0.15,
  alternative = "greater"
)
case2Test # Display test result
##
## 1-sample proportions test with continuity correction
##
## data: sum(case2Data$D6) out of sum(case2Data$totalDiseaseAccess), null probability 0.15
## X-squared = 21.311, df = 1, p-value = 1.953e-06
## alternative hypothesis: true p is greater than 0.15
## 95 percent confidence interval:
## 0.1564156 1.0000000
## sample estimates:
## p
## 0.160082
# Case 3: pooled-variance t-test of weekly user counts, 2015-16 vs 2017-18.
case3Data <- agriData
# Placeholder label; every row inside one of the two date ranges below is
# overwritten with its year group.
case3Data$yearGroup <- "Year"
case3Data$yearGroup[
  with(case3Data, `Month-Year` >= "2015-01-01" & `Month-Year` <= "2016-12-31")
] <- "2015-16"
case3Data$yearGroup[
  with(case3Data, `Month-Year` >= "2017-01-01" & `Month-Year` <= "2018-12-31")
] <- "2017-18"
# alternative = "less": mean of the first group (2015-16) is below the
# mean of the second group (2017-18).
case3Test <- t.test(
  case3Data$`No of users` ~ case3Data$yearGroup,
  alternative = "less",
  var.equal = TRUE
)
case3Test # Display test result
##
## Two Sample t-test
##
## data: case3Data$`No of users` by case3Data$yearGroup
## t = -9.2567, df = 121, p-value = 4.753e-16
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
## -Inf -107.5685
## sample estimates:
## mean in group 2015-16 mean in group 2017-18
## 50.06849 181.10000
u1 = Average usage in 1st week of every month
u2 = Average usage in 2nd week of every month
u3 = Average usage in 3rd week of every month
u4 = Average usage in 4th week of every month
H0: u1 = u2 = u3 = u4
Ha: Not all u are equal (At least two u are different)
# Case 4a: one-way ANOVA of Usage across the week-of-month labels.
# Keep only Usage and Week, with Week stored as a categorical variable.
case4aData <- data.frame(
  Usage = agriData$Usage,
  Week = factor(agriData$Week)
)
agriAnova <- aov(Usage ~ Week, data = case4aData) # Fit the ANOVA model
summary(agriAnova) # Display the ANOVA table
## Df Sum Sq Mean Sq F value Pr(>F)
## Week 3 1515178 505059 2.22 0.0894 .
## Residuals 119 27074553 227517
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
model.tables(agriAnova, type = "means") # Mean usage per week label
## Tables of means
## Grand mean
##
## 582.4959
##
## Week
## Week1 Week2 Week3 Week4
## 551.9 522.2 480 764.7
## rep 31.0 30.0 30 32.0
# Case 4b: pooled-variance t-test of Usage, records up to 2015 vs from 2016.
case4bData <- agriData
# Placeholder label, overwritten by the two date rules below.
case4bData$yearGroup <- "Year"
case4bData$yearGroup[
  with(case4bData, `Month-Year` <= "2015-12-31")
] <- "Till_2015"
case4bData$yearGroup[
  with(case4bData, `Month-Year` >= "2016-01-01")
] <- "From_2016"
# Groups are ordered alphabetically, so "greater" tests whether the mean for
# From_2016 exceeds the mean for Till_2015.
case4Test <- t.test(
  case4bData$Usage ~ case4bData$yearGroup,
  alternative = "greater",
  var.equal = TRUE
)
case4Test # Display test result
##
## Two Sample t-test
##
## data: case4bData$Usage by case4bData$yearGroup
## t = 3.5721, df = 121, p-value = 0.0002547
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## 198.3213 Inf
## sample estimates:
## mean in group From_2016 mean in group Till_2015
## 657.7041 287.6800
# Case 5: correlation between monthly total users and monthly total usage.
# The data ends in May 2018, so keep the records from May 2016 onward.
case5Data <- subset(agriData, subset = `Month-Year` >= "2016-05-01")
# One group per month.
groupByMonthCase5 <- group_by(case5Data, `Month-Year`)
# Monthly totals of users and usage.
case5Summary <- summarise(
  groupByMonthCase5,
  Users = sum(`No of users`),
  Usage = sum(Usage)
)
## `summarise()` ungrouping output (override with `.groups` argument)
# Pearson correlation test between the two monthly totals.
cor.test(x = case5Summary$Users, y = case5Summary$Usage)
##
## Pearson's product-moment correlation
##
## data: case5Summary$Users and case5Summary$Usage
## t = 9.1887, df = 20, p-value = 1.287e-08
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7691185 0.9577074
## sample estimates:
## cor
## 0.8991594
# Case 6: average usage per month from Sep 2016 onward.
case6Data <- subset(agriData, subset = `Month-Year` >= "2016-09-01")
# One group per month.
groupByMonthCase6 <- group_by(case6Data, `Month-Year`)
# Mean of the usage values within each month.
case6Summary <- summarise(
  groupByMonthCase6,
  Usage = mean(Usage)
)
## `summarise()` ungrouping output (override with `.groups` argument)
case6Summary # Display the monthly averages
## # A tibble: 18 x 2
## `Month-Year` Usage
## <dttm> <dbl>
## 1 2016-09-01 00:00:00 396.
## 2 2016-10-01 00:00:00 1215
## 3 2016-11-01 00:00:00 751
## 4 2016-12-01 00:00:00 496.
## 5 2017-01-01 00:00:00 442.
## 6 2017-02-01 00:00:00 429.
## 7 2017-03-01 00:00:00 681.
## 8 2017-04-01 00:00:00 356.
## 9 2017-05-01 00:00:00 203
## 10 2017-09-01 00:00:00 1923
## 11 2017-10-01 00:00:00 1398.
## 12 2017-11-01 00:00:00 1300.
## 13 2017-12-01 00:00:00 1446.
## 14 2018-01-01 00:00:00 1123.
## 15 2018-02-01 00:00:00 718
## 16 2018-03-01 00:00:00 865.
## 17 2018-04-01 00:00:00 623.
## 18 2018-05-01 00:00:00 380
# Line graph (points joined by lines) of average usage against month.
plot(
  x = case6Summary$`Month-Year`,
  y = case6Summary$Usage,
  type = "o",
  col = "red",
  main = "Line Graph for Average Usage by Month",
  xlab = "Year",
  ylab = "Average Usage"
)
H0: Information about a disease is accessed similarly regardless of whether the weather conditions are favorable or unfavorable to that disease
Ha: Information about a disease is accessed more when the weather conditions are favorable to that disease
Weather condition is determined based on temperature and humidity
u1 = Mean access of the disease when the weather condition is favorable to that disease
u2 = Mean access of the disease when the weather condition is unfavorable to that disease
H0: u1 = u2
Ha: u1 > u2 or u1 < u2
# Load the per-district weather sheets from the case workbook.
belagaviAgriData <- read_excel(
  path = "IMB733-XLS-ENG Spreadsheet 3.xlsx",
  sheet = "Belagavi_weather"
) # Belagavi district
dharwadAgriData <- read_excel(
  path = "IMB733-XLS-ENG Spreadsheet 3.xlsx",
  sheet = "Dharwad_weather"
) # Dharwad district
# Belagavi, disease D1: compare mean D1 accesses between observations whose
# weather is favorable to D1 (temperature within [20, 24] and relative
# humidity above 80) and all other observations.
belagaviD1Data <- belagaviAgriData
belagaviD1Data$D1Favorable <- "No" # Default: weather not favorable
belagaviD1Data$D1Favorable[
  with(
    belagaviD1Data,
    Temperature >= 20 & Temperature <= 24 & `Relative Humidity` > 80
  )
] <- "Yes"
# Two-sided pooled-variance t-test of D1 between the "No" and "Yes" groups.
belagaviD1Test <- t.test(
  D1 ~ D1Favorable,
  data = belagaviD1Data,
  alternative = "two.sided",
  var.equal = TRUE
)
belagaviD1Test # Display test result
##
## Two Sample t-test
##
## data: D1 by D1Favorable
## t = -2.7605, df = 22, p-value = 0.01141
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -44.966364 -6.386351
## sample estimates:
## mean in group No mean in group Yes
## 11.91669 37.59305
# Dharwad, disease D1: compare mean D1 accesses between observations whose
# weather is favorable to D1 (temperature within [20, 24] and relative
# humidity above 80) and all other observations.
dharwadD1Data <- dharwadAgriData
dharwadD1Data$D1Favorable <- "No" # Default: weather not favorable
dharwadD1Data$D1Favorable[
  with(
    dharwadD1Data,
    Temperature >= 20 & Temperature <= 24 & `Relative Humidity` > 80
  )
] <- "Yes"
# Two-sided pooled-variance t-test of D1 between the "No" and "Yes" groups.
dharwadD1Test <- t.test(
  D1 ~ D1Favorable,
  data = dharwadD1Data,
  alternative = "two.sided",
  var.equal = TRUE
)
dharwadD1Test # Display test result
##
## Two Sample t-test
##
## data: D1 by D1Favorable
## t = -4.5934, df = 20, p-value = 0.000176
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -36.46288 -13.68817
## sample estimates:
## mean in group No mean in group Yes
## 6.515126 31.590651
# Belagavi, disease D2: favorable weather is temperature within [21.5, 24.5]
# and relative humidity above 83.
belagaviD2Data <- belagaviAgriData
belagaviD2Data$D2Favorable <- "No" # Default: weather not favorable
belagaviD2Data$D2Favorable[
  with(
    belagaviD2Data,
    Temperature >= 21.5 & Temperature <= 24.5 & `Relative Humidity` > 83
  )
] <- "Yes"
# Two-sided pooled-variance t-test of D2 between the "No" and "Yes" groups.
belagaviD2Test <- t.test(
  D2 ~ D2Favorable,
  data = belagaviD2Data,
  alternative = "two.sided",
  var.equal = TRUE
)
belagaviD2Test # Display test result
##
## Two Sample t-test
##
## data: D2 by D2Favorable
## t = -3.7247, df = 22, p-value = 0.001177
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -31.457485 -8.955867
## sample estimates:
## mean in group No mean in group Yes
## 9.173547 29.380223
# Dharwad, disease D2: favorable weather is temperature within [21.5, 24.5]
# and relative humidity above 83.
dharwadD2Data <- dharwadAgriData
dharwadD2Data$D2Favorable <- "No" # Default: weather not favorable
dharwadD2Data$D2Favorable[
  with(
    dharwadD2Data,
    Temperature >= 21.5 & Temperature <= 24.5 & `Relative Humidity` > 83
  )
] <- "Yes"
# Two-sided pooled-variance t-test of D2 between the "No" and "Yes" groups.
dharwadD2Test <- t.test(
  D2 ~ D2Favorable,
  data = dharwadD2Data,
  alternative = "two.sided",
  var.equal = TRUE
)
dharwadD2Test # Display test result
##
## Two Sample t-test
##
## data: D2 by D2Favorable
## t = -4.0726, df = 20, p-value = 0.0005937
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -51.47275 -16.60412
## sample estimates:
## mean in group No mean in group Yes
## 6.096486 40.134921
# Belagavi, disease D3: favorable weather is temperature within [22, 24]
# (no humidity condition is applied for D3).
belagaviD3Data <- belagaviAgriData
belagaviD3Data$D3Favorable <- "No" # Default: weather not favorable
belagaviD3Data$D3Favorable[
  with(belagaviD3Data, Temperature >= 22 & Temperature <= 24)
] <- "Yes"
# Two-sided pooled-variance t-test of D3 between the "No" and "Yes" groups.
belagaviD3Test <- t.test(
  D3 ~ D3Favorable,
  data = belagaviD3Data,
  alternative = "two.sided",
  var.equal = TRUE
)
belagaviD3Test # Display test result
##
## Two Sample t-test
##
## data: D3 by D3Favorable
## t = -2.2224, df = 22, p-value = 0.03685
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -37.398250 -1.292554
## sample estimates:
## mean in group No mean in group Yes
## 11.61233 30.95773
# Dharwad, disease D3: favorable weather is temperature within [22, 24]
# (no humidity condition is applied for D3).
dharwadD3Data <- dharwadAgriData
dharwadD3Data$D3Favorable <- "No" # Default: weather not favorable
dharwadD3Data$D3Favorable[
  with(dharwadD3Data, Temperature >= 22 & Temperature <= 24)
] <- "Yes"
# Two-sided pooled-variance t-test of D3 between the "No" and "Yes" groups.
dharwadD3Test <- t.test(
  D3 ~ D3Favorable,
  data = dharwadD3Data,
  alternative = "two.sided",
  var.equal = TRUE
)
dharwadD3Test # Display test result
##
## Two Sample t-test
##
## data: D3 by D3Favorable
## t = -1.5057, df = 20, p-value = 0.1478
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -67.52597 10.90986
## sample estimates:
## mean in group No mean in group Yes
## 11.96166 40.26971
# Belagavi, disease D4: favorable weather is temperature within [22, 26]
# and relative humidity above 85.
belagaviD4Data <- belagaviAgriData
belagaviD4Data$D4Favorable <- "No" # Default: weather not favorable
belagaviD4Data$D4Favorable[
  with(
    belagaviD4Data,
    Temperature >= 22 & Temperature <= 26 & `Relative Humidity` > 85
  )
] <- "Yes"
# Two-sided pooled-variance t-test of D4 between the "No" and "Yes" groups.
belagaviD4Test <- t.test(
  D4 ~ D4Favorable,
  data = belagaviD4Data,
  alternative = "two.sided",
  var.equal = TRUE
)
belagaviD4Test # Display test result
##
## Two Sample t-test
##
## data: D4 by D4Favorable
## t = -1.793, df = 22, p-value = 0.08674
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -24.404931 1.772927
## sample estimates:
## mean in group No mean in group Yes
## 12.97384 24.28984
# Dharwad, disease D4: favorable weather is temperature within [22, 26]
# and relative humidity above 85.
dharwadD4Data <- dharwadAgriData
dharwadD4Data$D4Favorable <- "No" # Default: weather not favorable
dharwadD4Data$D4Favorable[
  with(
    dharwadD4Data,
    Temperature >= 22 & Temperature <= 26 & `Relative Humidity` > 85
  )
] <- "Yes"
# Two-sided pooled-variance t-test of D4 between the "No" and "Yes" groups.
dharwadD4Test <- t.test(
  D4 ~ D4Favorable,
  data = dharwadD4Data,
  alternative = "two.sided",
  var.equal = TRUE
)
dharwadD4Test # Display test result
##
## Two Sample t-test
##
## data: D4 by D4Favorable
## t = -2.3147, df = 20, p-value = 0.03138
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -51.442463 -2.673366
## sample estimates:
## mean in group No mean in group Yes
## 12.10875 39.16667
# Belagavi, disease D5: favorable weather is temperature within [22, 24.5]
# and relative humidity within [77, 85].
belagaviD5Data <- belagaviAgriData
belagaviD5Data$D5Favorable <- "No" # Default: weather not favorable
belagaviD5Data$D5Favorable[
  with(
    belagaviD5Data,
    Temperature >= 22 & Temperature <= 24.5 &
      `Relative Humidity` >= 77 & `Relative Humidity` <= 85
  )
] <- "Yes"
# Two-sided pooled-variance t-test of D5 between the "No" and "Yes" groups.
belagaviD5Test <- t.test(
  D5 ~ D5Favorable,
  data = belagaviD5Data,
  alternative = "two.sided",
  var.equal = TRUE
)
belagaviD5Test # Display test result
##
## Two Sample t-test
##
## data: D5 by D5Favorable
## t = -3.6675, df = 22, p-value = 0.001352
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -40.79405 -11.32315
## sample estimates:
## mean in group No mean in group Yes
## 10.51547 36.57407
# Dharwad, disease D5: favorable weather is temperature within [22, 24.5]
# and relative humidity within [77, 85].
dharwadD5Data <- dharwadAgriData
dharwadD5Data$D5Favorable <- "No" # Default: weather not favorable
dharwadD5Data$D5Favorable[
  with(
    dharwadD5Data,
    Temperature >= 22 & Temperature <= 24.5 &
      `Relative Humidity` >= 77 & `Relative Humidity` <= 85
  )
] <- "Yes"
# Two-sided pooled-variance t-test of D5 between the "No" and "Yes" groups.
dharwadD5Test <- t.test(
  D5 ~ D5Favorable,
  data = dharwadD5Data,
  alternative = "two.sided",
  var.equal = TRUE
)
dharwadD5Test # Display test result
##
## Two Sample t-test
##
## data: D5 by D5Favorable
## t = -0.10853, df = 20, p-value = 0.9147
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -22.44986 20.22939
## sample estimates:
## mean in group No mean in group Yes
## 13.06725 14.17749
# Belagavi, disease D7: favorable weather is temperature above 25 and
# relative humidity above 80.
belagaviD7Data <- belagaviAgriData
belagaviD7Data$D7Favorable <- "No" # Default: weather not favorable
belagaviD7Data$D7Favorable[
  with(belagaviD7Data, Temperature > 25 & `Relative Humidity` > 80)
] <- "Yes"
# Two-sided pooled-variance t-test of D7 between the "No" and "Yes" groups.
belagaviD7Test <- t.test(
  D7 ~ D7Favorable,
  data = belagaviD7Data,
  alternative = "two.sided",
  var.equal = TRUE
)
belagaviD7Test # Display test result
##
## Two Sample t-test
##
## data: D7 by D7Favorable
## t = -3.4275, df = 22, p-value = 0.002408
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -82.52794 -20.30579
## sample estimates:
## mean in group No mean in group Yes
## 21.00642 72.42328
# Dharwad, disease D7: favorable weather is temperature above 25 and
# relative humidity above 80.
dharwadD7Data <- dharwadAgriData
dharwadD7Data$D7Favorable <- "No" # Default: weather not favorable
dharwadD7Data$D7Favorable[
  with(dharwadD7Data, Temperature > 25 & `Relative Humidity` > 80)
] <- "Yes"
# Two-sided pooled-variance t-test of D7 between the "No" and "Yes" groups.
dharwadD7Test <- t.test(
  D7 ~ D7Favorable,
  data = dharwadD7Data,
  alternative = "two.sided",
  var.equal = TRUE
)
dharwadD7Test # Display test result
##
## Two Sample t-test
##
## data: D7 by D7Favorable
## t = -0.72663, df = 20, p-value = 0.4759
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -58.41659 28.23304
## sample estimates:
## mean in group No mean in group Yes
## 19.90822 35.00000
| Case | Hypothesis Accepted | Conclusion |
|---|---|---|
| 1 | Alternative | With 95% confidence we can conclude that information pertaining to disease 6 has been accessed more than 60 times per week on average since Oct 2017. |
| 2 | Alternative | With 95% confidence we can conclude that disease 6 accounts for more than 15% of all accesses to disease information by app users. |
| 3 | Alternative | With 95% confidence we can conclude that average number of app users is higher in 2017-18 than in 2015-16. |
| 4a | Null | At the 5% significance level we cannot reject the null hypothesis: there is no statistically significant difference in average app usage across the four weeks of a month. |
| 4b | Alternative | With 95% confidence we can conclude that average app usage from Jan 2016 onward is higher than in 2015. |
| 5 | Alternative | With 95% confidence we can conclude that the app usage has increased with the number of users in last 2 years. |
| 6 | Not Applicable | As we can see from the monthly averages and the graph, the usage pattern shows a marked upward shift from Sep 2017 (based on visual inspection; no formal significance test was performed). |
| Disease | Belagavi | Dharwad |
|---|---|---|
| D1 | Alternative. Accessed more in favorable weather condition. | Alternative. Accessed more in favorable weather condition. |
| D2 | Alternative. Accessed more in favorable weather condition. | Alternative. Accessed more in favorable weather condition. |
| D3 | Alternative. Accessed more in favorable weather condition. | Null. Accessed similarly irrespective of weather condition being favorable or not. |
| D4 | Null. Accessed similarly irrespective of weather condition being favorable or not. | Alternative. Accessed more in favorable weather condition. |
| D5 | Alternative. Accessed more in favorable weather condition. | Null. Accessed similarly irrespective of weather condition being favorable or not. |
| D7 | Alternative. Accessed more in favorable weather condition. | Null. Accessed similarly irrespective of weather condition being favorable or not. |