# answers.R

#
# Statistics Project 1
#
# Ed Sarausad
# 12/6/2015
#
# TODO: Consider use of FWER, FDR, and associated solutions
#
library(stats)
library(utils)
library(nortest)

# Load the user-defined inference() function provided by DataCamp.
# The connection is created explicitly and closed after loading, following the
# pattern shown in the ?load examples, so it is not left for the garbage
# collector to clean up.
# NOTE(review): the file is fetched over plain http; confirm the source is
# trusted before loading objects from it.
inference_con <- url("http://assets.datacamp.com/course/dasi/inference.Rdata")
load(inference_con)
close(inference_con)

# Build the data path explicitly instead of calling setwd(), so running this
# script does not change the working directory of the R session.
project_dir <- "~/sliderule/statistics_project1/statistics project 1"

# Body temperature data; the analysis below uses the `temperature` and `gender`
# columns.
df <- read.csv(file.path(project_dir, "data", "human_body_temperature.csv"))

# Visual normality check: normal Q-Q plot of the temperatures with a reference
# line.
qqnorm(df$temperature)
qqline(df$temperature)

# 1. Is the distribution of body temperatures normal? Remember that this is a
#    condition for the CLT, and hence the statistical tests we are using, to apply.
#
# Each call below tests the null hypothesis that the temperatures are normally
# distributed, using the normality tests from the nortest package. The output
# recorded from the original run is kept beneath each call as `##` comments.
#
# NOTE: The author's position is that multiple-testing corrections (FWER, FDR,
# etc.) do not apply here, since every test uses the same data and asks the
# same question (normality).
# NOTE(review): running several tests at the 5% level still raises the chance
# that at least one rejects by luck alone -- confirm whether a correction is
# wanted (see the TODO in the file header).
#
ad.test(df$temperature)

## 
##  Anderson-Darling normality test
## 
## data:  df$temperature
## A = 0.5201, p-value = 0.1829

# Anderson-Darling: p = 0.1829 > 0.05, so normality is not rejected.

cvm.test(df$temperature)

## 
##  Cramer-von Mises normality test
## 
## data:  df$temperature
## W = 0.081952, p-value = 0.1937

# Cramer-von Mises: p = 0.1937 > 0.05, so normality is not rejected.

lillie.test(df$temperature)

## 
##  Lilliefors (Kolmogorov-Smirnov) normality test
## 
## data:  df$temperature
## D = 0.064727, p-value = 0.2009

# Lilliefors: p = 0.2009 > 0.05, so normality is not rejected.

pearson.test(df$temperature)

## 
##  Pearson chi-square normality test
## 
## data:  df$temperature
## P = 30.154, p-value = 0.002647

# Pearson chi-square: p = 0.002647 < 0.05, so this test alone rejects normality.
# NOTE(review): the nortest documentation cautions that the Pearson chi-square
# test is usually not recommended for testing normality because of its inferior
# power compared with the other tests -- confirm how much weight to give it.

sf.test(df$temperature)

## 
##  Shapiro-Francia normality test
## 
## data:  df$temperature
## W = 0.98379, p-value = 0.1113

# Shapiro-Francia: p = 0.1113 > 0.05, so normality is not rejected.
# Overall: four of the five tests fail to reject normality at the 5% level.

# 2. Is the true population mean really 98.6 degrees?
#    Bring out the one-sample hypothesis test! In this situation, is it
#    appropriate to apply a z-test or a t-test? How will the result be different?
#
# A z-test calls for a known population SD and a sample size > 30. The
# population SD is unknown here, which makes the t-test the textbook choice.
# The call below nevertheless reports a Z statistic (see the recorded output).
# NOTE(review): the original justification for using a z-score was that the
# data look normal on the qqnorm plot. The usual justification is instead the
# large sample (n = 130), for which t and z results are nearly identical --
# confirm the intended reasoning.
# NOTE(review): boot_method presumably only matters for simulation-based
# inference and is unused with method = "theoretical" -- confirm against the
# inference() definition.
inference(df$temperature, est = "mean", method = "theoretical", type = "ht", null=98.6, alternative="twosided", conflevel = 0.95, boot_method = "perc")

## Single mean 
## Summary statistics:

## mean = 98.2492 ;  sd = 0.7332 ;  n = 130 
## H0: mu = 98.6 
## HA: mu != 98.6 
## Standard error = 0.0643 
## Test statistic: Z = -5.455 
## p-value =  0

# The p-value prints as 0 after rounding; |Z| = 5.455 corresponds to a two-sided
# p-value far below 0.001.
# Because |Z| = 5.455 is well beyond 1.96, 98.6 lies outside the 95% confidence
# interval for the mean.
# With such a low p-value we reject the null and conclude there is strong
# evidence that the true mean is not 98.6.

# 3. At what temperature should we consider someone's temperature to be
#    "abnormal"? Start by computing the margin of error and confidence interval.
#
# The call below computes a 99% confidence interval for the population MEAN:
# (98.0836, 98.4149), i.e. 98.2492 +/- a margin of error of about 0.166.
# The original answer treats these bounds as the cutoffs for an abnormal
# temperature (below 98.0836 or above 98.4149).
# NOTE(review): a confidence interval for the mean describes uncertainty about
# the average, not the spread of individual readings. A range for individuals
# would be based on the sample SD instead, e.g. mean +/- 2.576 * sd, which is
# roughly 96.36 to 100.14 if temperatures are normal -- confirm which
# interpretation is intended.
inference(df$temperature, est = "mean", method = "theoretical", type = "ci", conflevel = 0.99, boot_method = "perc")

## Single mean 
## Summary statistics:

## mean = 98.2492 ;  sd = 0.7332 ;  n = 130 
## Standard error = 0.0643 
## 99 % Confidence interval = ( 98.0836 , 98.4149 )

# 4. Is there a significant difference between males and females in normal
#    temperature? Set up and solve a two-sample hypothesis test.
#
# H0: mean male and female body temperatures are equal (difference = 0)
# HA: mean male and female body temperatures differ (two-sided)

# Partition the readings by gender with standard bracket indexing; which()
# discards rows whose comparison is NA, matching the rows subset() would keep.
female <- df[which(df$gender == "F"), , drop = FALSE]
male <- df[which(df$gender == "M"), , drop = FALSE]

# All four tests below are two-sided and test a difference of 0 (the default
# `mu`). They differ only in the variance assumption and the group order.

# Female vs. male, pooled variance (Student's t-test)
t.test(x = female$temperature, y = male$temperature, var.equal = TRUE)

## 
##  Two Sample t-test
## 
## data:  female$temperature and male$temperature
## t = 2.2854, df = 128, p-value = 0.02393
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.03882216 0.53963938
## sample estimates:
## mean of x mean of y 
##  98.39385  98.10462

# Female vs. male, unequal variances (Welch's t-test, the t.test() default)
t.test(x = female$temperature, y = male$temperature)

## 
##  Welch Two Sample t-test
## 
## data:  female$temperature and male$temperature
## t = 2.2854, df = 127.51, p-value = 0.02394
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.03881298 0.53964856
## sample estimates:
## mean of x mean of y 
##  98.39385  98.10462

# Male vs. female, pooled variance: same test with the groups swapped, so only
# the signs of the statistic and interval flip.
t.test(x = male$temperature, y = female$temperature, var.equal = TRUE)

## 
##  Two Sample t-test
## 
## data:  male$temperature and female$temperature
## t = -2.2854, df = 128, p-value = 0.02393
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.53963938 -0.03882216
## sample estimates:
## mean of x mean of y 
##  98.10462  98.39385

# Male vs. female, unequal variances (Welch)
t.test(x = male$temperature, y = female$temperature)

## 
##  Welch Two Sample t-test
## 
## data:  male$temperature and female$temperature
## t = -2.2854, df = 127.51, p-value = 0.02394
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.53964856 -0.03881298
## sample estimates:
## mean of x mean of y 
##  98.10462  98.39385

# Every variant gives p ~ 0.024 < 0.05, so H0 is rejected at the 5% level: the
# recorded female mean (98.39) is higher than the male mean (98.10).

# answers.R
#
# edsar
#
# Wed Dec 23 15:09:38 2015