#
# Statistics Project 1
#
# Ed Sarausad
# 12/6/2015
#
# TODO: Consider use of FWER, FDR, and associated solutions
#
library(stats)
library(utils)
library(nortest)
# Load the inference() function provided by DataCamp.
# NOTE(review): this downloads and loads an .Rdata file over plain HTTP on
# every run, so the script needs network access and trusts the remote content;
# consider keeping a vetted local copy instead.
load(url("http://assets.datacamp.com/course/dasi/inference.Rdata"))

# Read the data through an explicit project path rather than calling setwd(),
# so that running this script does not change the session's working directory.
project_dir <- path.expand(
  "~/sliderule/statistics_project1/statistics project 1"
)
df <- read.csv(
  file.path(project_dir, "data", "human_body_temperature.csv")
)

# Normal Q-Q plot of the temperatures with a reference line, as a visual check
# of normality before running the formal tests.
qqnorm(df$temperature)
qqline(df$temperature)

# 1. Is the distribution of body temperatures normal? Remember that this is a
#    condition for the CLT, and hence the statistical tests we are using, to
#    apply.
#
# Run five tests from the normality package, nortest, on df$temperature. In
# each test the null hypothesis is that the distribution is normal, so a
# p-value above 0.05 means we fail to reject normality at the 5% level.
# The `##` lines record the output printed by each call.
# NOTE(review): the author's position is that FWER/FDR corrections do not apply
# here because every test asks the same question (normality) of the same data;
# see the TODO in the file header and confirm.
#
ad.test(df$temperature)
##
## Anderson-Darling normality test
##
## data: df$temperature
## A = 0.5201, p-value = 0.1829
# p-value = 0.1829 > 0.05: fail to reject normality.
cvm.test(df$temperature)
##
## Cramer-von Mises normality test
##
## data: df$temperature
## W = 0.081952, p-value = 0.1937
# p-value = 0.1937 > 0.05: fail to reject normality.
lillie.test(df$temperature)
##
## Lilliefors (Kolmogorov-Smirnov) normality test
##
## data: df$temperature
## D = 0.064727, p-value = 0.2009
# p-value = 0.2009 > 0.05: fail to reject normality.
pearson.test(df$temperature)
##
## Pearson chi-square normality test
##
## data: df$temperature
## P = 30.154, p-value = 0.002647
# p-value = 0.002647 < 0.05: taken alone, this test rejects normality, unlike
# the other four tests in this section.
# NOTE(review): the nortest documentation reportedly advises against relying on
# the Pearson chi-square test for normality (low power, result depends on the
# number of classes) -- confirm before letting this result drive a conclusion.
sf.test(df$temperature)
##
## Shapiro-Francia normality test
##
## data: df$temperature
## W = 0.98379, p-value = 0.1113
# p-value = 0.1113 > 0.05: fail to reject normality.
# 2. Is the true population mean really 98.6 degrees?
# Bring out the one-sample hypothesis test! In this situation, is it
# appropriate to apply a z-test or a t-test? How will the result be different?
#
# A z-test assumes the population SD is known. We do not know it, so the t-test
# is the strictly appropriate choice. With n = 130, however, the t distribution
# is very close to the standard normal, so the z-based test below is expected
# to give nearly the same answer; the t-test is run afterwards for comparison.
inference(
  df$temperature,
  est = "mean",
  method = "theoretical",
  type = "ht",
  null = 98.6,
  alternative = "twosided",
  conflevel = 0.95,
  boot_method = "perc"
)
## Single mean
## Summary statistics:
## mean = 98.2492 ; sd = 0.7332 ; n = 130
## H0: mu = 98.6
## HA: mu != 98.6
## Standard error = 0.0643
## Test statistic: Z = -5.455
## p-value = 0

# The p-value is printed as 0 because of rounding; with Z = -5.455 it is
# extremely small rather than exactly zero.
# With such a low p-value we reject the null at the 5% level and conclude there
# is strong evidence that the true mean is not 98.6 (equivalently, 98.6 does
# not fall within the 95% confidence interval for the mean).

# t-test counterpart of the z-test above (t.test defaults: two-sided
# alternative, 95% confidence level). It uses the same sample mean and standard
# error, so the statistic should match Z; only the reference distribution
# (t with n - 1 = 129 degrees of freedom) differs.
t.test(df$temperature, mu = 98.6)
# 3. At what temperature should we consider someone's temperature to be
#    "abnormal"? Start by computing the margin of error and confidence interval.
#
# Using the 99% confidence interval recorded below, this analysis treats a
# temperature below 98.0836 or above 98.4149 as abnormal. The margin of error
# is half the interval width: (98.4149 - 98.0836) / 2, roughly 0.166.
# NOTE(review): this interval is a confidence interval for the mean, not a
# range for individual readings -- confirm it is the intended "abnormal" cutoff.
inference(
  df$temperature,
  est = "mean",
  method = "theoretical",
  type = "ci",
  conflevel = 0.99,
  boot_method = "perc"
)
## Single mean
## Summary statistics:

## mean = 98.2492 ; sd = 0.7332 ; n = 130
## Standard error = 0.0643
## 99 % Confidence interval = ( 98.0836 , 98.4149 )
# 4. Is there a significant difference between males and females in normal temperature? Set up and solve a two-sample hypothesis test.
# H0: There is no difference between male and female normal temperatures
# HA: There is a significant difference between male and female normal temperatures
# Split the data into one data frame per gender. `%in%` yields FALSE (never NA)
# for a missing gender, so such rows end up in neither group.
male <- df[df$gender %in% "M", , drop = FALSE]
female <- df[df$gender %in% "F", , drop = FALSE]
#inference(male, female, est = "proportion", method = "theoretical", type = "ht", conflevel = 0.95, boot_method="perc", alternative = "twosided")
# Two-sample t-tests of the difference in mean temperature between the female
# and male subsets. All four calls below are two-sided (see the "alternative
# hypothesis" line in each recorded output); they differ only in whether the
# variances are pooled and in which group is passed first.
#
# Pooled t-test (var.equal = TRUE), female minus male.
t.test(female$temperature, male$temperature, mu=0, var.equal = TRUE)
##
## Two Sample t-test
##
## data: female$temperature and male$temperature
## t = 2.2854, df = 128, p-value = 0.02393
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.03882216 0.53963938
## sample estimates:
## mean of x mean of y
## 98.39385 98.10462
# Same comparison without var.equal, which gives the Welch test (see output
# title). The result is almost identical to the pooled test.
t.test(female$temperature, male$temperature, mu=0)
##
## Welch Two Sample t-test
##
## data: female$temperature and male$temperature
## t = 2.2854, df = 127.51, p-value = 0.02394
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.03881298 0.53964856
## sample estimates:
## mean of x mean of y
## 98.39385 98.10462
# The same two tests with the groups swapped (male minus female). Only the sign
# of t and of the confidence interval flips; the p-values are unchanged.
t.test(male$temperature, female$temperature, mu=0, var.equal = TRUE)
##
## Two Sample t-test
##
## data: male$temperature and female$temperature
## t = -2.2854, df = 128, p-value = 0.02393
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.53963938 -0.03882216
## sample estimates:
## mean of x mean of y
## 98.10462 98.39385
t.test(male$temperature, female$temperature, mu=0)
##
## Welch Two Sample t-test
##
## data: male$temperature and female$temperature
## t = -2.2854, df = 127.51, p-value = 0.02394
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.53964856 -0.03881298
## sample estimates:
## mean of x mean of y
## 98.10462 98.39385
# Conclusion: p-value is about 0.024 < 0.05 in every variant and the 95%
# confidence interval excludes 0, so at the 5% level the difference between the
# female (98.39) and male (98.10) sample means is statistically significant.