## Warning: package 'maps' was built under R version 3.6.3
## Warning: package 'broom' was built under R version 3.6.3
## Warning: package 'dplyr' was built under R version 3.6.3
# Per-column summary of the farms data set (quartiles for numeric columns,
# level counts for the categorical ones).
summary(farms)
## FarmID Region STEC Cows Bedding
## Min. : 1.0 Northland:102 Negative:576 Min. : 35.0 Bark :321
## 1st Qu.:200.8 Southland:104 Positive:224 1st Qu.: 189.0 Straw:479
## Median :400.5 Taranaki :180 Median : 319.0
## Mean :400.5 Waikato :414 Mean : 391.5
## 3rd Qu.:600.2 3rd Qu.: 522.2
## Max. :800.0 Max. :1872.0
## Longitude Latitude
## Min. :167.7 Min. :-46.53
## 1st Qu.:174.1 1st Qu.:-39.38
## Median :174.8 Median :-37.94
## Mean :174.1 Mean :-38.95
## 3rd Qu.:175.5 3rd Qu.:-37.48
## Max. :176.4 Max. :-34.93
# Bar chart: number of sampled farms in each region, one colour per region.
ggplot(farms, aes(x = Region, fill = Region)) +
  geom_bar() +
  labs(title = "Sampled Farms by Region")
The total numbers of sampled farms in each region were as follows: Northland 102, Southland 104, Taranaki 180, Waikato 414.
# Map of New Zealand: the `nz` outline drawn as polygons, with every sampled
# farm overlaid at its coordinates and coloured by region.
ggplot() +
  geom_polygon(
    data = nz,
    mapping = aes(x = long, y = lat, group = group),
    fill = "pink",
    colour = "green"
  ) +
  geom_point(
    data = farms,
    mapping = aes(x = Longitude, y = Latitude, colour = Region)
  ) +
  coord_fixed(ratio = 1.3) +
  labs(title = "Regions Sampled in NZ", x = "Longitude", y = "Latitude")
# Drop the mis-coded record(s): farms labelled Taranaki whose longitude is east
# of 175 degrees. The previous filter (`Longitude > 175`) did the opposite of
# what was intended: it kept ONLY farms east of 175 degrees and discarded the
# rest of the data set. All other farms are retained here.
only_farms <- subset(farms, !(Region == "Taranaki" & Longitude > 175))
Running the code above in RStudio omits the farm that is coded in the Taranaki region but has a longitude greater than 175 degrees (Farm ID 756).
# Boxplots of herd size (number of cows per farm) for each region.
ggplot(farms, aes(x = Region, y = Cows, fill = Region)) +
  geom_boxplot() +
  labs(title = "Farm Size vs Region")
The data show a larger number of cows in the Waikato region. Southland has a higher median number of cows per farm. The Waikato region has the largest range.
# Split the farms into one data frame per region so that each region's herd
# sizes can be analysed separately.
Northland <- subset(farms, subset = Region == "Northland")
Southland <- subset(farms, subset = Region == "Southland")
Taranaki  <- subset(farms, subset = Region == "Taranaki")
Waikato   <- subset(farms, subset = Region == "Waikato")
# One-sample t-test on Waikato herd sizes. Presumably run for the 95%
# confidence interval of the mean; the default null hypothesis (mean = 0) is
# not itself of interest.
t.test(Waikato$Cows)
##
## One Sample t-test
##
## data: Waikato$Cows
## t = 27.225, df = 413, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 363.286 419.830
## sample estimates:
## mean of x
## 391.558
# One-sample t-test on Northland herd sizes. Presumably run for the 95%
# confidence interval of the mean; the default null hypothesis (mean = 0) is
# not itself of interest.
t.test(Northland$Cows)
##
## One Sample t-test
##
## data: Northland$Cows
## t = 14.594, df = 101, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 315.5726 414.8588
## sample estimates:
## mean of x
## 365.2157
# One-sample t-test on Southland herd sizes. Presumably run for the 95%
# confidence interval of the mean; the default null hypothesis (mean = 0) is
# not itself of interest.
t.test(Southland$Cows)
##
## One Sample t-test
##
## data: Southland$Cows
## t = 16.495, df = 103, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 429.6561 547.0939
## sample estimates:
## mean of x
## 488.375
# One-sample t-test on Taranaki herd sizes. Presumably run for the 95%
# confidence interval of the mean; the default null hypothesis (mean = 0) is
# not itself of interest.
t.test(Taranaki$Cows)
##
## One Sample t-test
##
## data: Taranaki$Cows
## t = 18.871, df = 179, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 313.4660 386.6784
## sample estimates:
## mean of x
## 350.0722
# Same per-region t-tests in one pass: run t.test() on herd size within each
# region, convert each result to a one-row data frame with broom::tidy(), then
# stack the rows, recording the region name in a `Region` column.
ttests <- tapply(X = farms$Cows, INDEX = farms$Region, FUN = t.test)
tidied <- lapply(X = ttests, FUN = broom::tidy)
combined <- dplyr::bind_rows(tidied, .id = "Region")
combined
## # A tibble: 4 x 9
## Region estimate statistic p.value parameter conf.low conf.high method
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 North~ 365. 14.6 1.27e-26 101 316. 415. One S~
## 2 South~ 488. 16.5 1.14e-30 103 430. 547. One S~
## 3 Taran~ 350. 18.9 1.98e-44 179 313. 387. One S~
## 4 Waika~ 392. 27.2 3.34e-94 413 363. 420. One S~
## # ... with 1 more variable: alternative <chr>
# Mean herd size per region (points) with its 95% confidence interval
# (error bars), taken from the tidied t-test results.
ggplot(combined, aes(x = Region)) +
  geom_point(aes(y = estimate), size = 4) +
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.1) +
  labs(
    title = "Number of Cows per Farm (Including Uncertainty)",
    x = "Region",
    y = "Number of Cows"
  )
Conclusion: Data from the Waikato region showed the broadest range in the number of cows per farm, but Southland showed the widest confidence interval, which shows that the estimate of mean herd size for Southland has the largest uncertainty.
# Bedding type by region, first as side-by-side counts ...
ggplot(farms, aes(x = Region, fill = Bedding)) +
  geom_bar(position = "dodge") +
  labs(title = "Bedding Type Used per Region", y = "Amount of Farms")
# ... then as stacked proportions, so regions of different size are comparable.
ggplot(farms, aes(x = Region, fill = Bedding)) +
  geom_bar(position = "fill") +
  labs(title = "Bedding Type Used per Region")
# Cross-tabulate region (rows) against bedding type (columns) and print it.
tab <- table(farms$Region, farms$Bedding)
tab
##
## Bark Straw
## Northland 45 57
## Southland 18 86
## Taranaki 84 96
## Waikato 174 240
# Row-wise proportions (margin = 1): within each region, the share of farms
# using each bedding type, so each row sums to 1.
prop.table(tab, margin = 1)
##
## Bark Straw
## Northland 0.4411765 0.5588235
## Southland 0.1730769 0.8269231
## Taranaki 0.4666667 0.5333333
## Waikato 0.4202899 0.5797101
# Test whether the proportion of farms using Bark (the first column of the
# table) is the same in all four regions.
tab <- table(farms$Region, farms$Bedding)
prop.test(x = tab)
##
## 4-sample test for equality of proportions without continuity
## correction
##
## data: tab
## X-squared = 27.045, df = 3, p-value = 5.761e-06
## alternative hypothesis: two.sided
## sample estimates:
## prop 1 prop 2 prop 3 prop 4
## 0.4411765 0.1730769 0.4666667 0.4202899
We hypothesise (as the null hypothesis) that there is no difference in the proportions of bedding type between regions. A small p-value (typically ≤ 0.05) indicates strong evidence against the null hypothesis, so you reject the null hypothesis. Here the p-value is 5.761e-06, so we can infer that there is a difference between regions in regard to bedding.
# Boxplots of herd size for STEC-negative and STEC-positive farms.
ggplot(farms) +
  geom_boxplot(aes(x = STEC, y = Cows)) +
  labs(y = "Cows")
There is some suggestion in the boxplot that STEC-positive herds are larger than STEC-negative herds, but there is quite a bit of overlap. To assess whether the difference we see in the sample is also present in the population, we need to find a confidence interval for the mean herd size of Positive and of Negative farms, then see if they overlap. Alternatively, we can calculate a confidence interval for the difference in mean herd size between positive and negative farms; if it doesn’t contain 0, we’d conclude that there is a difference.
# One-sample t-test on herd size within each STEC group, tidied and stacked
# into one data frame with the group name in a `STEC` column.
ttests <- tapply(X = farms$Cows, INDEX = farms$STEC, FUN = t.test)
tidied <- lapply(X = ttests, FUN = broom::tidy)
combined <- dplyr::bind_rows(tidied, .id = "STEC")

# Mean herd size per STEC group with its 95% confidence interval.
ggplot(
  combined,
  aes(x = STEC, y = estimate, ymin = conf.low, ymax = conf.high)
) +
  geom_errorbar(width = 0.1) +
  geom_point(size = 4) +
  labs(y = "Mean Cows")

# Two-sample (Welch) t-test for the difference in mean herd size between
# STEC-negative and STEC-positive farms.
t.test(Cows ~ STEC, data = farms)
##
## Welch Two Sample t-test
##
## data: Cows by STEC
## t = -5.5039, df = 329.58, p-value = 7.478e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -182.19207 -86.24692
## sample estimates:
## mean in group Negative mean in group Positive
## 353.8698 488.0893
The 95% confidence interval for the difference in mean herd size (Negative minus Positive) is -182.2 to -86.2, so herds testing positive are larger on average than herds testing negative. This interval is inconsistent with a difference of 0 in the population, so we’d conclude that our data suggest there is an association between STEC results and herd size.
# Side-by-side counts of STEC-negative and STEC-positive farms in each region.
ggplot(farms, aes(x = Region, fill = STEC)) +
  geom_bar(position = "dodge") +
  labs(title = "STEC VS Region")

# Cross-tabulate region (rows) against STEC status (columns) and print it.
tab <- table(farms$Region, farms$STEC)
tab
##
## Negative Positive
## Northland 76 26
## Southland 60 44
## Taranaki 146 34
## Waikato 294 120
# Row-wise proportions (margin = 1): within each region, the share of farms
# testing negative and positive, so each row sums to 1.
prop.table(tab, margin = 1)
##
## Negative Positive
## Northland 0.7450980 0.2549020
## Southland 0.5769231 0.4230769
## Taranaki 0.8111111 0.1888889
## Waikato 0.7101449 0.2898551
# Test whether the proportion of STEC-negative farms (the first column of the
# table) is the same in all four regions.
tab <- table(farms$Region, farms$STEC)
prop.test(x = tab)
##
## 4-sample test for equality of proportions without continuity
## correction
##
## data: tab
## X-squared = 18.49, df = 3, p-value = 0.0003484
## alternative hypothesis: two.sided
## sample estimates:
## prop 1 prop 2 prop 3 prop 4
## 0.7450980 0.5769231 0.8111111 0.7101449
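Conclusions: The majority of farms test negative for STEC in every region. Southland has the highest proportion of positive results (42%), and the p-value of 0.0003484 indicates that the proportion of positive farms differs between regions.

### 3. STEC versus bedding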
# Side-by-side counts of STEC-negative and STEC-positive farms for each
# bedding type.
ggplot(farms, aes(x = Bedding, fill = STEC)) +
  geom_bar(position = "dodge") +
  labs(title = "STEC VS Bedding")

# Cross-tabulate bedding type (rows) against STEC status (columns) and print it.
tab <- table(farms$Bedding, farms$STEC)
tab
##
## Negative Positive
## Bark 244 77
## Straw 332 147
# Row-wise proportions (margin = 1): within each bedding type, the share of
# farms testing negative and positive, so each row sums to 1.
prop.table(tab, margin = 1)
##
## Negative Positive
## Bark 0.7601246 0.2398754
## Straw 0.6931106 0.3068894
# Test whether the proportion of STEC-negative farms (the first column of the
# table) is the same for Bark and Straw bedding.
tab <- table(farms$Bedding, farms$STEC)
prop.test(x = tab)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: tab
## X-squared = 3.9555, df = 1, p-value = 0.04672
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## 0.002059453 0.131968474
## sample estimates:
## prop 1 prop 2
## 0.7601246 0.6931106
Conclusion: According to the p-value (0.04672), we can infer that there may be an association between bedding type and STEC results.
Results suggest that the differences in STEC prevalence observed across regions in New Zealand are not due to the region itself but are more closely associated with bedding used and farm size. When comparing STEC results across the regions, the majority of farms test negative for STEC. Southland has the highest proportion of positive results (42%), and the p-value of 0.0003484 is less than 0.05, indicating that STEC prevalence does differ between regions. Results for bedding used by region show that straw is preferred in every region, most strongly in Southland (83%) and then Waikato (58%), both of which have larger farm sizes. We must then ask ourselves whether there is an association between STEC and bedding or between STEC and farm size, and which has stronger evidence. Next we look at the results for STEC vs bedding, which indicate that straw bedding is associated with a higher proportion of positive STEC results (31% for straw versus 24% for bark); the p-value of 0.04672 is less than 0.05, but only just, so there is weak evidence of an association between STEC and bedding used. STEC vs farm size results indicate that larger farms are more likely to have positive STEC results. After running a Welch two-sample t-test, the 95% confidence interval for the difference in mean herd size (Negative minus Positive) is -182.2 to -86.2, so farms testing positive are larger on average than farms testing negative. This is inconsistent with a difference of 0 in the population, so we’d conclude that our data suggest there is an association between STEC results and farm size. However, the confidence interval is wide, so we must be cautious about the size of the difference. In conclusion, regions with larger farms tend to have more positive STEC results, which is also associated with the type of bedding they use. There is a stronger association with farm size than with bedding used, as straw bedding was preferred in regions that coincidentally had larger farm sizes.