#UTAustinX: UT.7.21x Foundations of Data Analysis - Part 2
#Statistical Inference

#load library packages
library(ggplot2)
library(data.table)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
###########Week 1: Sampling####################

#sample mean = estimate of population mean
#sampling error = the sample mean may not exactly match the population mean
#the distribution of sample means is normal
#regardless of the population's shape, the sampling distribution will be normal
#the sampling distribution can therefore use the normal model (the z-distribution)
#the SD of the sampling distribution is also called the "standard error"
#the larger the n for each of the samples that we take to construct the
#distribution of sample means, the smaller the standard error
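#(A quick illustrative sketch, with made-up numbers rather than course data,
#of how the standard error sigma/sqrt(n) shrinks as n grows:)
sigma <- 10                #hypothetical population standard deviation
n <- c(5, 15, 25, 100)     #increasing sample sizes
round(sigma/sqrt(n), 2)    #4.47 2.58 2.00 1.00 -- SE falls as n rises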

#############Central Limit Theorem (CLT)##############
# The distribution of sample statistics is nearly normal,
# centered at the population mean, with a standard deviation
# equal to the population standard deviation divided by the
# square root of the sample size.


# Conditions for CLT:
#   1. Independence: Sampled observations must be independent.
#   random sample/assignment
#   if sampling without replacement, n < 10% of population
#   2. Sample size/skew: Either the population distribution is
#   normal, or if the population distribution is skewed, the
#   sample size is large (rule of thumb: n>30)
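#(A minimal simulation sketch of the skew condition, using a made-up
#right-skewed population from rexp(): with n=30 the sample means still
#pile up in a bell shape.)
set.seed(1)                          #for reproducibility
skewed_pop <- rexp(10000, rate = 1)  #strongly right-skewed population
xbars30 <- replicate(1000, mean(sample(skewed_pop, size = 30)))
hist(xbars30)                        #roughly normal, centered near 1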

#Focus on:
# sample size/skew condition

#############CLT for the mean (examples)##############
# Suppose my iPod has 3000 songs. The mean length is 3.45
# minutes and the standard deviation is 1.63 minutes.
# Calculate the probability that a randomly selected 
# song lasts more than 5 minutes.
#x= length of one song
#P(x>5)
#500 of the 3000 songs are over 5 minutes
#divide the number of songs over 5 minutes by the total
#number of songs
round(500/3000,2)
## [1] 0.17
#Answer: .17, or 17% of the songs on my iPod last over 5 minutes

# I'm about to take a trip to visit my parents and the drive is
# 6 hours long. I make a random playlist of 100 songs. What is
# the probability that my playlist lasts the entire drive?

  # 6 hours = 360 minutes
  # p(x1+x2+x3...etc> 360)
  # p(x>3.6)=?
  # 
  # So we want the average length to be greater than 3.6
  # minutes. Remember, this is not the same thing as every
  # single song being more than 3.6 minutes. Because
  # that would give me a very, very long playlist. That would
  # tell me that the minimum length of that playlist would be
  # 360 minutes. I just want the total to be greater than 360
  # minutes to last me the entire drive. 

pop_mean<-3.45
pop_sd<-1.63
sample_mean<-3.6 #this is a sample mean, not an individual observation
sample_size<-100
#6 hours = 360 minutes
360/100
## [1] 3.6
#each song can be an average length of 3.6 minutes #this will be the sample mean
#standard error of the mean as standard deviation/squareroot(sample size)
sem<-print(pop_sd/sqrt(sample_size))
## [1] 0.163
#Next we calculate the z-score
zscore<-(sample_mean-pop_mean)/(pop_sd/sqrt(sample_size))
#or
zscore<-(sample_mean-pop_mean)/sem #divide by standard error
zscore
## [1] 0.9202454
#P(z>.92)=.179
1-pnorm(zscore) #calculate probability using the pnorm function
## [1] 0.1787223
pnorm(zscore, lower.tail = FALSE) #OR 
## [1] 0.1787223
#So there's almost an 18% chance that my playlist lasts
#at least the entire drive
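#(Sanity check: pnorm can also take the sampling distribution's mean and
#standard error directly, skipping the hand-computed z-score.)
pnorm(3.6, mean = 3.45, sd = 1.63/sqrt(100), lower.tail = FALSE) #~0.179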

#Another Example: Distributions of random variables
# A housing survey was conducted to determine the price of a typical
#home in Topanga, CA. The mean price of a house was roughly $1.3mill with a
#standard deviation of $300,000. There were no houses listed below
#$600,000 but a few houses above $3mill.
#What is the probability that the mean of 60 randomly chosen houses
#in Topanga is more than $1.4 million.

pop_mean<-1300000
pop_sd<-300000
sample_mean<-1400000 #this is a sample mean, not an individual observation
sample_size<-60
sem<-print(pop_sd/sqrt(sample_size))
## [1] 38729.83
sem
## [1] 38729.83
zscore<-(sample_mean-pop_mean)/sem #divide by standard error
zscore
## [1] 2.581989
#(P mean >= 1400000 )
pnorm(zscore, lower.tail = FALSE)
## [1] 0.004911637
#P(Z>2.58)=.0049


#Another Example: Distributions of random variables (Overweight Baggage)
# Suppose weights of the checked baggage of airline passengers follow a
# nearly normal distribution with a mean 45 pounds and standard
# deviation 3.2 pounds. What is the probability that the total weight
# of 10 bags is greater than 460 lbs?
460/10
## [1] 46
pop_mean<-45
pop_sd<-3.2
sample_mean<-46 #this is a sample mean, not an individual observation 
sample_size<-10 #n is below 30
sem<-print(pop_sd/sqrt(sample_size))
## [1] 1.011929
sem
## [1] 1.011929
zscore<-(sample_mean-pop_mean)/sem #divide by standard error
zscore #=.9882118
## [1] 0.9882118
#(P total >= 460)
pnorm(zscore, lower.tail = FALSE)
## [1] 0.1615245
#P(Z>.9882118)=0.1615245 or 16% probability that the total weight
# of 10 bags is greater than 460 lbs


#############The Sampling Distribution##############
#Comprehension Check
#1.  Below are two sampling distributions showing the average library fines
#owed by patrons of the Chicago public library system. They are drawn
#from the same population.
#1a. Which distribution shows means calculated from larger sized samples?
#Answer: Distribution B


#1b. What is the likely average library fine owed by a Chicago Public Library patron?
#Answer: $4.50

#1c. What can be said about the variability in the sample means for both these distributions?
#The sample means in Distribution B are more similar to each other, resulting in a tighter cluster of values. 

#1d. If we were to repeat the survey using larger sample sizes than either of those in Distribution A and Distribution B, what would you predict to be the shape of the new distribution?
#Bell-shaped, but taller and narrower than the other distributions

#2a. Which sample size resulted in the tightest clustering of values around the "true" population mean?
#n=10

#2b. Which sample size resulted in the lowest variability in sample means?
#n=10

#2c. What happens to the shape of the sampling distribution as the sample size increases from 1 to 10?
#It becomes more normal (bell-shaped).

#2d. How many of the sample means, when N=10, fall within 1 standard deviation of the true population mean?
#About 68%
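#(A quick pnorm check of that 68% figure on the standard normal:)
pnorm(1) - pnorm(-1) #~0.68, the Empirical Rule's one-standard-error band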

##############The Confidence Interval##############

#1.  The average resting pulse of adult women is about 74 beats per minute. 
#The standard deviation in the population is believed to be 13 beats per
#minute. The distribution of resting pulse rates is known to be skewed right.

pop_mean<-74
pop_sd<-13
sample_mean<-77
sample_size<-36

#1a. You take a sample of 36 women from this population and obtain a 
#sample mean of 77 beats per minute. What is the shape of the sampling
#distribution from which your mean comes?
#The sampling distribution will be nearly Normal because the sample size of 36 is sufficiently large.

#1b. What value should be at the center of the sampling distribution?
#74 beats per minute

#1c. How much variation should we expect in sample means given a
#sample size of 36? In other words, what is the value of the
#standard error?
#standard error of the mean as standard deviation/squareroot(sample size)
sem<-13/sqrt(36)
round(sem,2)
## [1] 2.17
#1d. What is the z-score of our sample mean of 77 in the sampling
#distribution given sample sizes of 36? (Round to 2 decimal places.)

#Z Score Formula: Standard Error of the Mean
#z = (x – μ) / (σ / √n)
#When you have multiple samples and want to describe the standard deviation of those sample means (the standard error)

zscore<-(sample_mean-pop_mean)/(pop_sd/sqrt(sample_size))
round(zscore,2)
## [1] 1.38
#or
z<-(77-74)/(13/sqrt(36))
round(z,2)
## [1] 1.38
#1e. What is the probability of observing our sample mean of 77,
#or one that is larger, when the true population mean is 74 beats
#per minute?
#using zscore table-zscore=1.38
#browseURL("http://www.stat.ufl.edu/~athienit/Tables/Ztable.pdf")
.9162
## [1] 0.9162
#P(x>77)
answer<-print(1-.9162)
## [1] 0.0838
#1f. Construct a 95% confidence interval around your sample mean of 77 beats per minute using this formula:
z<-1.96 #for 95% confidence
upperBound<-sample_mean+1.96*(pop_sd/sqrt(sample_size)) #to the right of the mean
lowerBound<-sample_mean-1.96*(pop_sd/sqrt(sample_size))#to the left of the mean
round(print(c(lowerBound, upperBound)),2)
## [1] 72.75333 81.24667
## [1] 72.75 81.25
#What are the bounds of the 95% confidence interval?
round(print(c(lowerBound, upperBound)),2)
## [1] 72.75333 81.24667
## [1] 72.75 81.25
#1g. Does the 95% confidence interval give you results that are
#consistent or inconsistent with your earlier z-score?
#Consistent. The 95% confidence interval shows that the population mean
#from which the sample was drawn could be 74. Similarly, the z-score
#of the sample mean places it clearly within the expected sample mean
#values for a known population mean of 74.


#2.  Consumer Reports tested 14 brands of vanilla yogurt and found the following numbers of calories per serving:

calories<- c(180, 200, 190, 230, 80, 160, 170,
             130, 140, 220, 110, 120, 100, 170)
summary(calories)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    80.0   122.5   165.0   157.1   187.5   230.0
hist(calories)

#Kernel density plots are usually a much more effective way to view the distribution of a variable.

d<-density(calories)#plot kernel density
plot(d)

####Using sample function to take a sample 
caloriesSample<-sample(calories, 7, replace = FALSE) #sample of 7
hist(caloriesSample)

summary(caloriesSample)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    80.0   105.0   120.0   142.9   180.0   230.0
e<-density(caloriesSample)
plot(e)

library(psych)
describe(caloriesSample)
##    vars n   mean    sd median trimmed  mad min max range skew kurtosis
## X1    1 7 142.86 54.69    120  142.86 59.3  80 230   150 0.36    -1.67
##       se
## X1 20.67

#2a. What is the sample mean? (Round to 2 decimal places.)
describe(calories)
##    vars  n   mean    sd median trimmed   mad min max range  skew kurtosis
## X1    1 14 157.14 45.48    165   157.5 51.89  80 230   150 -0.05    -1.29
##       se
## X1 12.15
round(mean(calories),2)
## [1] 157.14

#2b. If the standard deviation reported by the yogurt industry is
#48.5 calories, how much variation should we expect between sample
#means for samples of size 14? (Round to 2 decimal places.)

#standard error of the mean as standard deviation/squareroot(sample size)
SEM<-48.5/sqrt(14)
round(SEM,2)
## [1] 12.96

#2c. What is the margin of error if the standard deviation is
#48.5 and the samples are of size 14? (Assume 95% confidence
#and round to 1 decimal place.)

#The Margin of Error can be calculated in two ways:
#Margin of error = Critical value x Standard deviation
#Margin of error = Critical value x Standard error of the statistic
#The critical value is either a t-score or a z-score. 

alpha<-1-.95 #compute alpha
p<-1-alpha/2 #compute critical probability
degreesFreedom<-14-1 #degrees of freedom
criticalValue<-1.96 #critical value
SEM #bring back standard error of the mean
## [1] 12.96217
marginError<-criticalValue*SEM
round(marginError,1)
## [1] 25.4
#Answer: 25.4
sample_mean<-mean(calories)
pop_sd<-48.5
sample_size<-14


z<-1.96 #for 95% confidence
upperBound<-sample_mean+1.96*(pop_sd/sqrt(sample_size)) #to the right of the mean
lowerBound<-sample_mean-1.96*(pop_sd/sqrt(sample_size))#to the left of the mean
round(print(c(lowerBound, upperBound)),1)
## [1] 131.7370 182.5487
## [1] 131.7 182.5

#Finding the critical value 95% confidence
(1-.95)/2 #equals .025 or 2.5%
## [1] 0.025
#Since the total area under the curve is one and the curve is symmetric, each tail holds an equally sized area.
#Next: locate .025 within the table to get the zscore
#The critical values in a confidence interval formula are always positive
qnorm(.025) #use qnorm when you want to find the cutoff values
## [1] -1.959964
#take the positive version to use as the critical value


#What is the critical value for the 98% confidence level?
(1-.98)/2
## [1] 0.01
qnorm(.01) #Answer: 2.33
## [1] -2.326348
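#(A general sketch: the positive critical value for any confidence level
#comes straight from qnorm on the upper-tail cutoff.)
conf_levels <- c(.90, .95, .98, .99)
round(qnorm(1 - (1 - conf_levels)/2), 3) #1.645 1.960 2.326 2.576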
#2d. What is the 95% confidence interval for the average calorie
#content of vanilla yogurt? (Round to 1 decimal place.)
round(print(c(lowerBound, upperBound)),1)
## [1] 131.7370 182.5487
## [1] 131.7 182.5
#2e. The FDA reports that the average caloric content of vanilla yogurt is 150 calories. Does your confidence interval support or refute this claim?
#Support
##############Descriptive Stats Tutorial##############
#Univariate Descriptive Statistics

#import dataset from url
animaldata<-fread("https://d37djvu3ytnwxt.cloudfront.net/assets/courseware/v1/78cc1365ee7f9798d6dfd02cb35aab74/asset-v1:UTAustinX+UT.7.11x+2T2017+type@asset+block/AnimalData.csv")


mean(animaldata$Age.Intake)
## [1] 2.348837
median(animaldata$Age.Intake)
## [1] 1
#what is the spread?--amount of variability within a data set--the standard dev.
sd(animaldata$Age.Intake)
## [1] 3.099837
#the five number summary
fivenum(animaldata$Age.Intake)
## [1]  0  0  1  3 17
#So with just a few simple functions we can pretty fully describe the single numerical variable age-at-intake, either by reporting the mean and standard deviation together or by reporting the median along with the five-number summary.
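#(Aside: summary() packs most of the same description into one call.)
summary(animaldata$Age.Intake)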


################Sampling Distributions Tutorial############

#Student Survey Dataset
survey<-fread("https://d37djvu3ytnwxt.cloudfront.net/assets/courseware/v1/4f9dc97be6bf95ac780db574b3541586/asset-v1:UTAustinX+UT.7.21x+2T2017+type@asset+block/StudentSurvey.csv")




#what is the distribution of the students' ages?
mean(survey$age)
## [1] 19.25594
sd(survey$age)
## [1] 2.269055
hist(survey$age)

sample(survey$age, size = 30)
##  [1] 18 24 18 19 18 19 18 19 19 19 19 21 18 18 19 21 19 19 18 18 19 19 19
## [24] 18 18 18 18 18 19 20
myxbar <- rep(NA,1000)
  for (i in 1:1000){
    mysamp <- sample(survey$age, size = 30)
    myxbar[i] <- mean(mysamp)
  }
hist(myxbar)

mean(myxbar) #very close to the population mean
## [1] 19.26947
sd(myxbar)
## [1] 0.3974797
sd(survey$age)/sqrt(30) #not exactly the same but very close
## [1] 0.4142708
#############Week 1: Sampling Pre-Lab#####################
#In this lab, we will examine how sample data can be used to discover the truth
#about a population.  Our population data consists of data we collected from our
#statistics students here at The University of Texas at Austin.
#They told us several things about themselves, including how happy they
#are and the amount of time they study.  We'll run a few simulations on
#this data to see if we can replicate what the Central Limit Theorem tells
#us about sampling. We are pretending that we don't know the "true" population
#parameters, but in fact we do!
  
  
  
  
  #Primary Research Question

#How many letters long is the typical UT student’s name?
#How does our estimate change as we increase the size of our sample?
#Check the Data

#1a) How many students are in this dataset?
glimpse(survey)
## Observations: 379
## Variables: 17
## $ ID             <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, ...
## $ gender         <chr> "Male", "Female", "Female", "Male", "Female", "...
## $ age            <int> 17, 19, 18, 19, 19, 19, 18, 18, 19, 19, 18, 23,...
## $ classification <chr> "Freshman", "Sophomore", "Freshman", "Sophomore...
## $ name_letters   <int> 4, 8, 6, 4, 7, 5, 5, 8, 8, 5, 7, 6, 4, 3, 8, 7,...
## $ happy          <int> 80, 76, 50, 75, 89, 90, 57, 60, 75, 100, 2, 75,...
## $ concerts       <int> 2, 15, 3, 0, 1, 1, 0, 0, 1, 4, 0, 1, 5, 0, 1, 0...
## $ hair_color     <chr> "black", "black", "brown", "black", "brown", "b...
## $ own_shoes      <int> 108, 42, 6, 10, 13, 12, 12, 6, 15, 25, 5, 6, 8,...
## $ greek          <chr> "yes", "no", "no", "yes", "no", "yes", "no", "n...
## $ live_campus    <chr> "yes", "no", "yes", "no", "no", "no", "yes", "y...
## $ roomates       <int> 1, 1, 1, 1, 2, 3, 1, 1, 1, 3, 1, 1, 1, 1, 3, 1,...
## $ austin         <dbl> 10, 10, 10, 8, 9, 10, 7, 8, 8, 8, 4, 9, 8, 8, 1...
## $ birth_month    <int> 11, 12, 11, 5, 5, 12, 8, 9, 6, 3, 5, 8, 5, 12, ...
## $ commute        <chr> "walk", "drive by myself", "walk", "bus", "bus"...
## $ car            <chr> "no", "no", "no", "no", "no", "yes", "no", "no"...
## $ sport          <chr> "yes", "no", "no", "no", "no", "no", "no", "no"...
#1b) How many of the first 10 students in the dataset had names longer than 5 letters?
first10<-survey[1:10,]
      sqldf('SELECT COUNT(*)
              FROM first10
              WHERE name_letters > 5')
## Loading required package: tcltk
## Warning: Quoted identifiers should have class SQL, use DBI::SQL() if the
## caller performs the quoting.
##   COUNT(*)
## 1        5
#1c) How long is the name of the first student in the dataset who is happy less
      #than 40% of the time?
print(unhappy<-sqldf('SELECT *
        FROM survey
        WHERE happy < 40
        LIMIT 1' ))
##   ID gender age classification name_letters happy concerts hair_color
## 1 11   Male  18         Junior            7     2        0      black
##   own_shoes greek live_campus roomates austin birth_month commute car
## 1         5    no          no        1      4           5     bus  no
##   sport
## 1   yes


#Check the Variables of Interest
#Let's find the variables we need to answer the question.

#2a) Which variable tells us how many letters are in each student’s
#first name? The name of this variable in the dataset is:
#name_letters
head(survey$name_letters)
## [1] 4 8 6 4 7 5
#2b) What type of variable is this--categorical or quantitative?
#quantitative


#Reflect on the Method
#Which method should we be using for this analysis and why?

#3a) What makes something a sampling distribution?
#It is a distribution of sample statistics (here, means), each computed from
#a separate sample of the same size drawn from the same population.

#3b) What does the Central Limit Theorem predict about a sampling
#distribution of means?
#It will be nearly normal, centered at the population mean, with a standard
#deviation (the standard error) equal to the population sd divided by sqrt(n).
###########Prepare for the Analysis###############

# Determine the population parameters:
#   1. Visualize the shape of the population data by making a histogram.  
#   2. Calculate the “true” mean and standard deviation of the population.
# 
# 
# Compare the sample statistics:  
#   3. Draw 1,000 samples of size n=5 from the population data.  Calculate the mean of each sample. 
#   4. Graph these 1,000 sample means in a histogram and examine the shape.
#   5. Calculate the mean and standard deviation of the sampling distribution.
#   6. Repeat this process for samples of size n=15 and n=25.
#   7. Compare the results you get to the predictions of the Central Limit Theorem.


  # Calculate the population parameters
  hist(survey$name_letters)
  fivenum(survey$name_letters)
## [1]  2  5  6  7 10
  mean(survey$name_letters)
## [1] 5.970976
  sd(survey$name_letters)
## [1] 1.49486
  # Draw 1,000 samples of n=5 and find the mean of each sample.
  xbar5 <-rep(NA, 1000)
  for (i in 1:1000)
  {x <-sample(survey$name_letters, size =5)
  xbar5[i] <-  mean(x)}

  # Graph the histogram of 1,000 sample means.
  hist(xbar5,xlim=c(2,10))

  # Calculate the mean and sd of the sampling distribution.
  mean(xbar5)
## [1] 5.9654
  sd(xbar5)
## [1] 0.6578721
  # Compare to the std dev predicted by the CLT.
  sd(survey$name_letters)/sqrt(5)
## [1] 0.6685215
  #Repeat for samples of size n=15
  xbar15 <-rep(NA, 1000)
  for (i in 1:1000)
  {x <-sample(survey$name_letters, size =15)
  xbar15[i] <- mean(x)}
  hist(xbar15,xlim=c(2,10))

  mean(xbar15)
## [1] 5.971533
  sd(xbar15)
## [1] 0.3682824
  sd(survey$name_letters)/sqrt(15)
## [1] 0.3859711
  #Repeat for samples of size n=25
  xbar25 <-rep(NA, 1000)
  for (i in 1:1000)
  {x <-sample(survey$name_letters, size =25)
  xbar25[i] <- mean(x)}
  hist(xbar25,xlim=c(2,10))

  mean(xbar25)
## [1] 5.96528
  sd(xbar25)
## [1] 0.2981708
  sd(survey$name_letters)/sqrt(25)
## [1] 0.2989719
#1a) What is x? from above
  #x is a sample of 5 data values drawn from the population

#1b) What is mean(x)?
  #It is the mean of the 5 data points drawn in each sample.

#1c) When the loop is in the 200th iteration (i=200),
  #what will the following code be doing:

  xbar5[i] <- mean(x)
  #Calculating the mean of the 200th sample, and placing it in the 200th position of the xbar5 vector.
  
  
#2) The standard deviation of a sampling distribution is
  #called a "standard error." What goes in the denominator
  #of this equation to solve for standard error (SE)? SE=σ / ?
  #standard error of the mean as standard deviation/squareroot(sample size)
  #Answer: √n
#3) We used the following code to try to show the sampling distribution of ages:
  xbar5 <-rep(NA, 1000)
  for (i in 1:1000)
  {x <-sample(survey$age, size =5)
  xbar5[i] <- mean(x)}
  hist(xbar5,xlim=c(2,10))

#Why was the histogram that R produced blank?
  #the scale of the x-axis is set from 2 to 10, but the ages are not in this range.
  
  
###########Conduct the Analysis###############

#Population Parameters

  #1a) What is the average name length, in number of letters,
  #for all of the students in the population?
  #(Round to 2 decimal places.)

round(mean(survey$name_letters),2)
## [1] 5.97
#1b) By how many letters, on average, do names vary from the
  #mean? (Round to 2 decimal places.)
round(sd(survey$name_letters),2)
## [1] 1.49
#2) In this lab, each time we sampled from our population we
#kept the ________ the same at 1,000, but we increased
#the ________ from 5 to 25.
#number of samples
#sample size


#Observing the Sampling Distributions

#3a) The mean was ________ for all three sampling distributions.
mean(xbar5) #(note: xbar5 now holds the age sample means from #3 above)
## [1] 19.2188
mean(xbar15)
## [1] 5.971533
mean(xbar25)
## [1] 5.96528
#Answer: About the same


#3b) The size of the standard error ________ as the sample size increased from 5 to 25.
sd(xbar5)
## [1] 0.9730948
sd(xbar15)
## [1] 0.3682824
sd(xbar25)
## [1] 0.2981708
#Answer: Decreased


#3c) The distributions became more and more ________ as the sample size increased.

h1<-ggplot(data = NULL, aes(x = xbar5)) +
  geom_histogram(binwidth = 1, color = "black", fill = "white") +
  ggtitle("Xbar5")

h2<-ggplot(data = NULL, aes(x = xbar15)) +
  geom_histogram(binwidth = 1, color = "black", fill = "white") +
  ggtitle("Xbar15")

h3<-ggplot(data = NULL, aes(x = xbar25)) +
  geom_histogram(binwidth = 1, color = "black", fill = "white") +
  ggtitle("Xbar25")

print(h1)
print(h2)
print(h3)

describe(xbar5)
##    vars    n  mean   sd median trimmed  mad  min  max range skew kurtosis
## X1    1 1000 19.22 0.97     19   19.06 0.59 17.6 25.2   7.6 1.88     4.77
##      se
## X1 0.03
describe(xbar15)
##    vars    n mean   sd median trimmed mad  min  max range  skew kurtosis
## X1    1 1000 5.97 0.37      6    5.97 0.4 4.87 7.13  2.27 -0.01    -0.21
##      se
## X1 0.01
describe(xbar25)
##    vars    n mean  sd median trimmed mad  min  max range  skew kurtosis
## X1    1 1000 5.97 0.3      6    5.97 0.3 4.88 6.76  1.88 -0.17    -0.15
##      se
## X1 0.01
#normal
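#(A quick visual check of that answer, assuming the xbar vectors above are
#still in the workspace: the n=25 means should hug the normal Q-Q line.)
qqnorm(xbar25)
qqline(xbar25)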


par(mfrow=c(1,1)) #reset graphics device


#According to the Central Limit Theorem:
#The central limit theorem states that the sampling
#distribution of the mean of any independent,
#random variable will be normal or nearly normal,
#if the sample size is large enough.
#The mean of the sampling distribution of the mean is the mean of the population from which the scores were sampled


#4a) What is the mean of the sampling distribution (for n=5, 15, or 25)? (Round to 2 decimal places)
round(mean(survey$name_letters),2)
## [1] 5.97
#In practice, the Standard Deviation of the Sampling
#Distribution is known as the Standard Error (Se)
#of the statistic computed


# There are two important pieces to take away from this lesson.
# First, notice that the sample means become more and more normally
# distributed around the true mean (the population parameter)
# as we increase our sample size. Second, notice that the
# variability of the sample means decreases as sample size increases.
# The sample means are more tightly clustered around the true mean.
# This variability of sample means is called the standard error. You
# can think of it as the standard deviation of a sampling distribution.
# And here is one last piece of information to take away.
# The sampling distribution, as it becomes more normal in
# shape, also adheres to the Empirical Rule. This means that
# certain proportions of the sample means will fall within defined
# increments. In this case, each increment would be one standard error
# from the population parameter. According to this rule, 34% of the sample
# means will fall within one standard error above the population parameter,
# and another 34% will fall within one standard error below the population
# parameter. In addition, probability theory says that 95% of the samples
# will fall within two standard errors of the true value, and 99.7% will
# fall within three standard errors.
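#(A simulation check of the Empirical Rule, reusing xbar25 from above:
#roughly 68% of the sample means should sit within one standard error of
#the population mean, and roughly 95% within two.)
se25 <- sd(survey$name_letters)/sqrt(25)
mean(abs(xbar25 - mean(survey$name_letters)) < se25)   #expect ~0.68
mean(abs(xbar25 - mean(survey$name_letters)) < 2*se25) #expect ~0.95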


#Using the Central Limit Theorem to Construct the Sampling Distribution

#Notes to Remember
# Shape of the sampling distribution: 
#   As long as your sample size is 30 or greater, you may assume the
#   distribution of the sample means to be approximately normal. This
#   is true regardless of the original distribution of the random
#   variable.
# 
# The mean of the distribution: 
#   The mean of a sampling distribution, as you saw in the last
#   lesson, is the mean of the population. Formally: μx = μ
# 
# The standard error of the distribution: The standard deviation of the
# sample means can be estimated by dividing the standard deviation
# of the population by the square root of the sample size.


#4b) What is the standard error of the sampling distribution for n=5?
round(sd(survey$name_letters)/sqrt(5),3)
## [1] 0.669
#4c) What is the standard error of the sampling distribution for n=15?
round(sd(survey$name_letters)/sqrt(15),3)
## [1] 0.386
#4d) What is the standard error of the sampling distribution for n=25?
round(sd(survey$name_letters)/sqrt(25),3)
## [1] 0.299
#5) Were the results of the simulations consistent with what the CLT predicted?
#Yes

#################Write Your Conclusion####################

# In this lab, we knew the average name length for our population of college students. The population mean was 
# 5.97 letters and the standard deviation was 1.49. The name lengths were 
# normally distributed.
# 
# We drew samples of different sizes from our population to simulate the Central Limit Theorem. In short, the CLT says three things:
#   
#   1. As sample size increases, sampling distributions become more normal.
#   2. The mean of the sampling distribution will be the population mean
#   3. The variability of the sample means can be predicted by dividing the population
#   standard deviation by the square root of the sample size.
  
  #Our simulation results were consistent with this theory. As we increased the size of our sample from 5 to 25, the sample means became
  #less variable and tended to cluster more tightly around the true mean.
  #In other words, our sample means became better estimators of the true population mean.
  
  
#############Lab 1: UT Student Survey Data####################
#Review of the Central Limit Theorem
  
#In this lab, you will use a simulation to better understand the
#Central Limit Theorem. Let's start by remembering the key
#features of the Central Limit Theorem.
  
#1a) In this lab, we will draw samples to answer the following
  #question: What percentage of the time are college students happy?
  #How does the Central Limit Theorem predict our answer to this
  #question will change as sample size increases?
#Answer: As sample size increases, our sample means should become
#less variable and be closer to the true mean.
  
#1b) What does it mean to increase the sample size in a simulation?
#It means to draw more individuals in each of our samples.
  
#1c) What should be true about our sampling distributions as we increase our sample size?
#The means should remain about the same, but the standard errors should decrease. 
  
#rm(list = ls())  # Clean up

#################Lab Preparation##################

#Analyze the Data
#import the survey data again (same StudentSurvey.csv as above)
survey<-fread("https://d37djvu3ytnwxt.cloudfront.net/assets/courseware/v1/4f9dc97be6bf95ac780db574b3541586/asset-v1:UTAustinX+UT.7.21x+2T2017+type@asset+block/StudentSurvey.csv")
  
#Population Parameters

#1a) What is the shape of the population happiness scores?
hist(survey$happy)
#negatively skewed  
  
#1b) What percentage of the time are college students happy, on average? (report with no decimals and no %)
mean(survey$happy)

#1c) What is the standard deviation of the happiness percent scores? (round to 1 decimal place)
round(sd(survey$happy),1)

#1d) Is it more common for students to have high or low happiness percent scores relative to the range of percent scores in the population?
#high

#Simulation

# Calculate the population parameters
  hist(survey$happy)
  fivenum(survey$happy)
  mean(survey$happy)
  sd(survey$happy)
  
  
  # Draw 1,000 samples of n=5 and find the mean of each sample.
  xbar5 <-rep(NA, 1000)
  for (i in 1:1000)
  {x <-sample(survey$happy, size =5)
  xbar5[i] <-  mean(x)}
  
  
  # Graph the histogram of 1,000 sample means.
  hist(xbar5)
  
  
  # Calculate the mean and sd of the sampling distribution.
  mean(xbar5)
  sd(xbar5)
  
  # Compare to the std dev predicted by the CLT.
  sd(survey$happy)/sqrt(5)
  
  
  #Repeat for samples of size n=15
  xbar15 <-rep(NA, 1000)
  for (i in 1:1000)
  {x <-sample(survey$happy, size =15)
  xbar15[i] <- mean(x)}
  hist(xbar15)
  mean(xbar15)
  sd(xbar15)
  sd(survey$happy)/sqrt(15)
  
  
  #Repeat for samples of size n=25

  xbar25 <-rep(NA, 1000)
  for (i in 1:1000)
  {x <-sample(survey$happy, size =25)
  xbar25[i] <- mean(x)}
  hist(xbar25)
  mean(xbar25)
  sd(xbar25)
  sd(survey$happy)/sqrt(25)

#For the sampling distributions:

#2a) The mean was ________ for all three sampling distributions.
  #approximately the same
  mean(xbar5)
  mean(xbar15)
  mean(xbar25)

#2b) The sample error (SE) ________ as sample size increased.
  #decreased
16.30906/sqrt(5)
16.30906/sqrt(15)
16.30906/sqrt(25)


#2c) The distributions became _______ as sample size increased.
#more normal
par(mfrow=c(1,3))
hist(xbar5)
hist(xbar15)
hist(xbar25)
par(mfrow=c(1,1)) #reset graphics device

#Central Limit Theorem

#For the following questions, please use the rounded standard deviation
#value you provided above where necessary.

#3a) According to the Central Limit Theorem, what do we expect the
#mean to be for each sampling distribution (n=5, n=15 and n=25)?
#(round to 2 decimal places)
round(mean(survey$happy),2)

#3b) According to the Central Limit Theorem, what should be the standard
#error for the sampling distribution of n=5? (round to 2 decimal places).
round(sd(survey$happy)/sqrt(5),2)

#3c) According to the Central Limit Theorem, what should be the standard
#error for the sampling distribution of n=15? (round to 2 decimal places).
round(sd(survey$happy)/sqrt(15),2)

#3d) According to the Central Limit Theorem, what should be the standard
#error for the sampling distribution of n=25? (round to 2 decimal places).
round(sd(survey$happy)/sqrt(25),2)

#4) Based on these simulations, what can you say about the relationship
#between the shape of the population and
#the shape of the sampling distribution of means?
#Answer: If the sample size is large enough, the sampling distribution
#will be Normal no matter what the shape of the population.

#Write your conclusion
# In this lab, we knew the average percentage of the time college students
# are happy for our population of college students. The population mean was
# 78.0 % and the standard deviation was 16.31%. The happiness scores were 
# negatively skewed. We drew samples of different sizes from our population to
# simulate the Central Limit Theorem. In short, the CLT says three things:
# 
# 1. As sample size increases, sampling distributions become more Normal.
# 2. The mean of the sampling distribution will be the population mean.
# 3. The variability of the sample means, or the standard error, can be
# predicted by dividing the population standard deviation by the square root
# of the sample size.
# 
# Our simulation results were consistent with this theory. As we increased
# the size of our sample from 5 to 25, the sample means became less variable
# and tended to cluster more tightly around the true mean.
# In other words, our sample means became better estimators of
# the true population mean. In addition, the shape of the distribution
# became more normal as sample size increased.

  
#rm(list = ls())  # Clean up

#############Week 1: Problem Set#####################

#Question 1

# On a scale of 1 to 10, how much do UT Austin students like Austin?
# 
# 1. What are the true mean and standard deviation for our population of
# UT Austin students?
# 
# 2. What should the sampling distribution of the mean look like,
# as predicted by the Central Limit Theorem?
# 
# 3. How do our simulated values compare to these predicted values?
# 
# Use the "StudentSurvey.csv" dataset to answer the following questions.
# Instructions for installing "StudentSurvey.csv" can be found under the
# Examine the Data unit in this week's Pre-Lab section.

survey<-fread("https://d37djvu3ytnwxt.cloudfront.net/assets/courseware/v1/4f9dc97be6bf95ac780db574b3541586/asset-v1:UTAustinX+UT.7.21x+2T2017+type@asset+block/StudentSurvey.csv")

#1a. Create a histogram of the "austin" variable for the
#entire population of students that took the survey
#Which best describes the shape of the distribution?
names(survey)
hist(survey$austin)
#Left (Negative) Skewed

#1b. What is the population mean for the "austin" variable?
#(Round to 2 decimal places.)
round(mean(survey$austin),2)

#1c. What is the population standard deviation for the "austin" variable?
#(Report to 2 decimal places.)
round(sd(survey$austin),2)

#1d. Use the Central Limit Theorem to predict the mean and
#standard deviation of the sampling distribution of means for
#samples of size n=10 drawn from this population: 

 
#What is the expected mean? (Round to 2 decimal places.)
#SUMMARY: The mean of a sampling distribution is the mean of the population. Formally: μx = μ
round(mean(survey$austin),2)

#What is the expected standard deviation? (Round to 2 decimal places.)
# Compare to the std dev predicted by the CLT.
round(sd(survey$austin)/sqrt(10),2)

#1e. Simulate drawing 1,000 random samples of sample size n=10 from the
#"austin" distribution, then create a histogram of the sampling distribution
#and calculate its mean and standard deviation. How do these simulated
#values compare to those predicted by the Central Limit Theorem?

   # Draw 1,000 samples of n=10 and find the mean of each sample.
  xbar10 <-rep(NA, 1000)
  for (i in 1:1000)
  {x <-sample(survey$austin, size =10)
  xbar10[i] <-  mean(x)}
  hist(xbar10)
  mean(xbar10)
  sd(xbar10)
  
#ANSWER: THE VALUES ARE CLOSE TO ONE ANOTHER
## [1] "We fail to reject the null hypothesis (Ho)"
#Question 2

# A population of sunflower plants is described as having a monthly
# growth rate that follows a normal distribution with μ = 3.08 in
# and σ = 0.40 in.
# 
# Use this information to answer the following questions.
  

  
#2a. What is the probability that a randomly chosen sunflower plant
#grows more than 3.2 inches in a month? (Round to 3 decimal places.)
  pop_mean<-3.08
  pop_sd<-.40
  sample_size<-1
  sample_mean<-3.2
  
zscore<-(sample_mean-pop_mean)/(pop_sd/sqrt(sample_size))
zscore

#using zscore table-zscore=.3
#browseURL("http://www.stat.ufl.edu/~athienit/Tables/Ztable.pdf")
.6179 
round((1-.6179),3)
  
#2b. A middle-school science class grew 25 of these sunflowers.
#How many inches would they expect these flowers to have grown,
#on average, one month later? (Round to 2 decimal places.)
3.08

#2c. The middle school science teacher replicates her study
#with 25 new sunflowers every year. How much variability in
#inches should she expect in the average monthly growth of these
#samples? (Round to 2 decimal places.)
sem<-pop_sd/sqrt(25)
round(sem,2)

#2d. The science teacher notices that the average monthly growth of her 25
#sunflowers has never exceeded 3.2 inches. What should she conclude?
#μ = 3.08 in and σ = 0.40 in.

#ANSWER: We wouldn't expect to see an average monthly growth of 3.2 inches
#for a sample of 25 plants. Her data is probably fine.

#2e. What is the probability that her next sample of 25
#sunflowers will grow an average of more than 2.9, but less
#than 3.2 inches, in a month? (Report as a proportion rounded
#to 3 decimal places.)

  pop_mean<-3.08
  pop_sd<-.40
## [1] "We fail to reject the null hypothesis (Ho)"
  sample_size<-25
  sample_mean1<-2.9
  
  zscore1<-(sample_mean1-pop_mean)/(pop_sd/sqrt(sample_size))
  zscore1

  #P(2.9 < xbar < 3.2)
#using zscore table-zscore=-2.25
#browseURL("http://www.stat.ufl.edu/~athienit/Tables/Ztable.pdf")
.0122 #this is just for the average of 2.9
round((.0122),3)

##pnorm gives the area below 2.9 (the lower tail)
below2point9<-pnorm(zscore1)


  pop_mean<-3.08
  pop_sd<-.40
  sample_size<-25
  sample_mean2<-3.2
  
  zscore2<-(sample_mean2-pop_mean)/(pop_sd/sqrt(sample_size))
  zscore2
  #zscore=1.5
  .9332 #from ztable
  less3point2<-pnorm(zscore2)#under the curve

round(less3point2-below2point9,3) #probability
(less3point2-below2point9)*100 #percentage

#Answer: .921 or 92.09%
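#(Sanity check: the same two-sided probability in one pass, with pnorm taking
#the sampling distribution's mean and standard error directly.)
se <- .40/sqrt(25)
pnorm(3.2, mean = 3.08, sd = se) - pnorm(2.9, mean = 3.08, sd = se) #~0.921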
#rm(list = ls())  # Clean up

##Example A: Textbook
# The time it takes a student to complete the mid-term for Algebra II
#is a bi-modal distribution with mean = 1 hr and sd= 1 hr. During the
#month of June, Professor Spence administers the test 64 times.
#What is the probability that the average mid-term completion time
#for students during the month of June exceeds 48 minutes?

  pop_mean<-1
  pop_sd<-1
  sample_size<-64
  sample_mean<-48/60
  
  zscore<-(sample_mean-pop_mean)/(pop_sd/sqrt(sample_size))
  zscore
  #zscore = -1.6
  
##pnorm gives the proportion completing the exam in more than 48 minutes
1-pnorm(zscore) #area to the right (upper tail)
pnorm(zscore)#less than 48 minutes--area to the left (lower tail)
#rm(list = ls())  # Clean up

# Question 3
# 
# A very large company has its headquarters in a 15-story downtown office
# building. The morning commute time for employees of this company is normally
# distributed with a mean of 28 minutes and a standard deviation of 11 minutes.
# 
# The company in the building next door samples 23 of its employees
# and finds that their mean commute time is 35.1 minutes. Is there evidence
# that their commute time is longer than the other company's, or is this
# just random sampling error?
# 
# Use the Central Limit Theorem to determine if this sample mean is likely
# to be observed, assuming commute time is the same for both companies.


  pop_mean<-28
  pop_sd<-11
  sample_size<-23
  sample_mean<-35.1
  
  zscore<-(sample_mean-pop_mean)/(pop_sd/sqrt(sample_size))
  zscore
#zscore = 3.095491
  
##pnorm gives you the probability
1-pnorm(zscore) #area to the right (upper tail)
pnorm(zscore)#area to the left (lower tail)



#3a. What is the expected mean in minutes of the sampling distribution
#for samples of size n=23? (Report as a whole number)
#SUMMARY: The mean of a sampling distribution is the mean of the population. Formally: μx = μ
pop_mean

#3b. What is the expected standard error of the sampling distribution
#for samples of n=23? (Round to 2 decimal places.)
##standard error of the mean as standard deviation/squareroot(sample size)
expected_standard_error<- pop_sd/sqrt(sample_size)
round(expected_standard_error,2)
#3c. What is the z-score for the neighboring company's sample mean?
#(Round to 1 decimal place.)
zscore<-(sample_mean-pop_mean)/(pop_sd/sqrt(sample_size))
round(zscore,1)


#3d. What is the probability of observing a sample mean this high
#(or higher), if the employees really do commute the same amount of time?
round(1-pnorm(zscore),3) #area to the right (upper tail)
#Less than .001

#3e. What should we conclude about the sample mean of 35.1 minutes?
#It is a sample mean that is not likely to be observed.

#3f. What should we conclude about the commute time of the employees
#in the building next door?
#Their average commute time is probably NOT the same as the large company next door (our original population).

#3g. What must we assume about the 23 people that were sampled for our conclusion to be valid?
#They must be a representative sample of employees at the company.

#rm(list = ls())  # Clean up


#Question 4

# Dixie Queen uses an automatic ice cream dispenser to fill pint-sized
# containers of ice cream. The company that makes the dispenser says
# the volume it dispenses into each container follows a normal distribution
# with σ= 1.5 ml.
# 
# The Dixie Queen manager randomly selected 15 ice cream pints and found
# that the average volume was 471.46 ml. She wants to know if her machine
# is performing as expected.
# 
  pop_sd<-1.5
  sample_size<-15
  sample_mean<-471.46
  
  #Note: the questions below only need the standard error and a
  #confidence interval around the sample mean, so no z-score against
  #a hypothesized mean is computed here.

# 4a. What is the expected variability in sample means of size n=15?
# (Find the standard error and round to 3 decimal places.)
expected_standard_error<- pop_sd/sqrt(sample_size)
round(expected_standard_error,3)

#4b. What is the margin of error, assuming 95% confidence?
#(Round to 3 decimal places and use this value in the following
#calculations.)

criticalValue<-1.96 #critical value for 95% confidence
expected_standard_error #bring back standard error of the mean

marginError<-criticalValue*expected_standard_error
round(marginError,3)
#Answer: .759

#4c. Find the 95% confidence interval for the mean volume for this sample of 15 randomly selected ice cream pint containers.  

z<-1.96 #for 95% confidence
upperBound<-sample_mean+1.96*(pop_sd/sqrt(sample_size)) #to the right of the mean
lowerBound<-sample_mean-1.96*(pop_sd/sqrt(sample_size))#to the left of the mean
round(c(lowerBound, upperBound),3)

#What are the bounds of the 95% confidence interval?
round(c(lowerBound, upperBound),1)

#4d. A pint is equivalent to 473.20 ml. Do you think the dispenser
#is working as reported?
#No.

##############Week 2: Hypothesis Testing (One Group Means)#################
#               One-Tailed  Two-Tailed
# Z value (90%)   1.28        1.645
# 
# Z value (95%)   1.645       1.96
# 
# Z value (99%)   2.33        2.575
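
#These critical values can be reproduced in R with qnorm(); a quick
#check, not part of the original course notes:
qnorm(0.90)  #1.28  (90%, one-tailed)
qnorm(0.95)  #1.645 (90% two-tailed / 95% one-tailed)
qnorm(0.975) #1.96  (95%, two-tailed)
qnorm(0.99)  #2.33  (99%, one-tailed)
qnorm(0.995) #2.575 (99%, two-tailed)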


#Example A

# We have a medicine that is being manufactured and each pill is
# supposed to have 14 milligrams of the active ingredient. What
# are our null and alternative hypotheses?
            #This is a two-tail test
            # H0: μ = 14
            # Ha: μ ≠ 14
#Our null hypothesis states that the
#population has a mean equal to 14 milligrams.
#Our alternative hypothesis states that the population
#has a mean that is different than 14 milligrams.


# To reject the null hypothesis means to find a large enough difference
# between your sample mean and the hypothesized (null) mean that it raises
# real doubt that the true population mean is 14. If the difference between
# the hypothesized mean and the sample mean is very large, we reject the null
# hypothesis. If the difference is very small, we do not.

###############Two-tail Test#################
#Hypothesis that college freshmen study 20 hours per week
          # H0: μ = 20
          # Ha: μ ≠ 20

# The examples above are all two-tailed hypothesis tests. We indicate
# that the average study time is either 20 hours per week, or it is not.
# Computer use averages 3.2 hours per week, or it does not. 
# We do not specify whether we believe the true mean to be higher
# or lower than the hypothesized mean. We just believe it must be different.

# In a two-tailed test, you will reject the null hypothesis if your sample
# mean falls in either tail of the distribution. For this reason, the alpha
# level (let’s assume .05) is split across the two tails. The curve below
## [1] "We fail to reject the null hypothesis (Ho)"
# shows the critical regions for a two-tailed test. These are the regions
# under the normal curve that, together, sum to a probability of 0.05.
# Each tail has a probability of 0.025. The z-scores that mark
# the start of the critical regions are called the critical values.

# If the sample mean taken from the population falls within these critical
# regions, or "rejection regions," we would conclude that there was too
# much of a difference and we would reject the null hypothesis. However,
# if the mean from the sample falls in the middle of the distribution
# (in between the critical regions) we would fail to reject the null
# hypothesis.


###############One-Tailed Hypothesis Test###############

# We would use a single-tail hypothesis test when the direction
# of the results is anticipated or we are only interested in one
# direction of the results. For example, a single-tail hypothesis
# test may be used when evaluating whether or not to adopt a new
# textbook. We would only decide to adopt the textbook if it
# improved student achievement relative to the old textbook.

# When performing a single-tail hypothesis test, our alternative
# hypothesis looks a bit different. We use the symbols of greater
# than or less than. For example, let's say we were claiming that
# the average SAT score of graduating seniors was GREATER than 1,100.
# Remember, our own personal hypothesis is the alternative hypothesis.
# Then our null and alternative hypotheses could look something like:
# H0: μ ≤ 1100   Ha: μ > 1100

# A single-tail hypothesis test also means that we have only one critical
# region, because we put the entire critical region into just one side of
# the distribution. When the alternative hypothesis is that the sample
# mean is greater, the critical region is on the right side of the
# distribution.

#Illustration: shade the upper-tail rejection region (alpha = .05),
#here drawn on a chi-square(df = 25) density as the example curve
df   <- 25   #degrees of freedom for the example distribution
p    <- 0.05 #alpha level
gg   <- data.frame(x=seq(5,50,0.1))
gg$y <- dchisq(gg$x,df)

library(ggplot2)
ggplot(gg) + 
  geom_path(aes(x,y)) +
  geom_linerange(data=gg[gg$x>qchisq(p,df,lower.tail=F),],
                 aes(x, ymin=0, ymax=y),
                 colour="red")
#rm(list = ls())  # Clean up
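
#A minimal sketch of the same shaded-rejection-region idea on the
#standard normal (z) curve, which is what the z-tests here use; the
#cutoff qnorm(.95) = 1.645 marks the one-tailed alpha = .05 region:
zz <- data.frame(x = seq(-4, 4, 0.01))
zz$y <- dnorm(zz$x)
ggplot(zz) +
  geom_path(aes(x, y)) +
  geom_linerange(data = zz[zz$x > qnorm(0.95), ],
                 aes(x, ymin = 0, ymax = y),
                 colour = "red")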

# Type I and Type II Errors
# Remember that there will be some sample means that are extremes
# – that is going to happen about 5% of the time, since 95% of all
# sample means fall within about two standard deviations of the mean.
# What happens if we run a hypothesis test and we get an extreme sample mean?
# It won't look like our hypothesized mean, even if it comes from that
# distribution. We would be likely to reject the null hypothesis. But
# we would be wrong.
# 
# When we decide to reject or not reject the null hypothesis,
# we have four possible scenarios:
# a. A true null hypothesis is rejected. (Type I error)
# b. A true null hypothesis is not rejected. (correct decision)
# c. A false null hypothesis is not rejected. (Type II error)
# d. A false null hypothesis is rejected. (correct decision)
# 
# Lesson Summary
#     a. Hypothesis testing involves making educated guesses about a population 
#     based on a sample drawn from the population. We generate null and
#     alternative hypotheses based on the mean of the population to test
#     these guesses.
#     
#     b. We establish critical regions based on the level of significance, or
#     alpha (α) level. If the value of the test statistic falls in
#     these critical regions, we are able to reject the null hypothesis.
#     
#     c. When we make a decision about a hypothesis, there are
#     four possible outcomes and two different
#     types of errors. A Type I error is when we reject the null
#     hypothesis when it is true; a Type II error is when we do
#     not reject the null hypothesis, even when it is false.
#     
    
#Review
    
    # If the difference between the hypothesized population mean
    # and the mean of the sample is large, we reject the null hypothesis.
    # If the difference between the hypothesized population mean and the
    # mean of the sample is small, we fail to reject the null hypothesis.
    
    

# Hypothesis Tests and Their Tails
# There are three types of tests from a "tails" standpoint:
# • A left-tailed test only has a tail on the left side of the graph.
# • A right-tailed test only has a tail on the right side of the graph.
# • A two-tailed test has tails on both ends of the graph. This is a test where the null hypothesis is a claim of a
# specific value. For example: H0 : μ = 5

# 
# Critical values are the values separating the test statistics that
# support the null hypothesis from those that lead us to reject it.
# 
# Critical regions are the areas under the distribution
# curve representing values that lead us to reject the null hypothesis.

#The critical value for a 95% confidence level is Z = +/-1.96.
# Example A
# A researcher claims that black horses are, on average, more than 30 lbs heavier than white horses, which average 1100 lbs. What is the null hypothesis, and what kind of test is this?
# 
# Solution A
# The null hypothesis would be notated H0 : μ ≤ 1130 lbs
# This is a right-tailed test, since the tail of the graph would be on the right. Values well above 1130 would
# indicate that the null hypothesis should be rejected, and the red area represents the rejection region.

# Example B
# A package of gum claims that the flavor lasts more than 39 minutes. What would be the null hypothesis of a test to determine the validity of the claim? What sort of test is this?
# 
# Solution B
# The null hypothesis would be notated as H0 : μ ≤ 39.
# This is a right-tailed test, since the rejection region would consist of values greater than 39.


# Example C
# An ice pack claims to stay cold between 35 and 65 minutes.
# What would be the null hypothesis of a test to determine the validity
# of the claim? What sort of test would it be?
# 
# Solution C
# The null hypothesis would be H0 : 35 > μ ∪ μ > 65.
# This is a two-tailed test, since the null hypothesis contains
# the values above and below a given range.
################One sample t test################
# If we reject the null hypothesis we are saying that the
# difference between the observed sample mean and the
# hypothesized population mean is too great to be attributed
# to chance. When we fail to reject the null hypothesis, we
# are saying that the difference between the observed sample mean
# and the hypothesized population mean is probable if the null
# hypothesis is true. Essentially, we are willing to attribute
# this difference to sampling error.
# 
# Conducting a Hypothesis Test on One Sample Mean When the

# Population Parameters are Known
#   Although this is rarely the case, we can use our familiar
#   z-statistic to conduct a hypothesis test on a single sample
#   mean. In short, we find the z-statistic of our sample mean
#   in the sampling distribution and determine if that z-score
#   falls within the critical (rejection) region or not. This
#   test is only appropriate when you know the true mean and
#   standard deviation of the population.


#Example A
# The school nurse thinks the average height of 7th graders
# has increased. The average height of a 7th grader five years
# ago was 145 cm with a standard deviation of 20 cm. She takes
# a random sample of 200 students and finds that the average
# height of her sample is 147 cm. Are 7th graders now taller
# than they were before? Conduct a single-tailed hypothesis
# test using a .05 significance level to evaluate the null
# and alternative hypotheses.

pop_mean<-145
pop_sd<-20
sample_size<-200
sample_mean<-147

#1st step: Develop null and alternative hypotheses
#H0 :μ≤145
#Ha :μ>145

#2nd step: Choose α = .05. Determine the critical value.
#A one-tailed test with α = .05 leaves 95%
#of the area under the curve outside the critical region.
#We use a reference (Z-table) to find the z-score for
#a cumulative area of .9500, which is approximately 1.645.
#This is a one-tailed test, and a z-score of 1.645
#cuts off 5% in the single tail. Any test statistic
#greater than 1.645 will be in the rejection region.
zscore<-1.645 

#3rd step: Calculate the test statistic for the sample of 7th graders
z_test<-(sample_mean-pop_mean)/(pop_sd/sqrt(sample_size))
z_test


if(z_test > zscore){
  print("We reject the null hypothesis (Ho).")
} else {
    print("We fail to reject the null hypothesis (Ho)")
} 



#The calculated z-score of 1.414 is smaller than 1.645 and
#thus does not fall in the critical region. Our decision
#is to fail to reject the null hypothesis and conclude
#that a sample mean of 147 could plausibly be due to
#sampling chance alone.
#rm(list = ls())  # Clean up


# Example B

# A farmer is trying out a planting technique that he
# hopes will increase the yield on his pea plants.
# The average number of pods on one of his pea plants
# is 145 pods with a standard deviation of 100 pods.
# This year, after trying his new planting technique,
# he takes a random sample of his plants and finds the
# average number of pods to be 147. He wonders whether
# or not this is a statistically significant increase.
# What are his hypotheses and the test statistic?
# 
#   1. First, we develop our null and alternative
#   hypotheses:
#   H0 :μ≤145
#   Ha :μ>145
# 
#     2. Second step: Choose alpha. α = .05 Determine
#       critical value.
#     This is a one-tailed test, and a z-score of 1.645
#     cuts off 5% in the single tail. Any test statistic
      #greater than 1.645 will be in the rejection region.
      a<-1.645
# 
#     3. 3rd step: Calculate the test statistic
    pop_mean<-145
    pop_sd<-100
    #NOTE: the problem does not state the farmer's sample size; we use
    #n = 100 here purely as a hypothetical value so the code runs.
    sample_size<-100
    sample_mean<- 147 #this is a sample mean, not an individual observation
    
    sem<-pop_sd/sqrt(sample_size) #standard error of the mean = sd/sqrt(sample size)
    z_test<-(sample_mean-pop_mean)/sem
    z_test

    z_test2<-(sample_mean-pop_mean)/(pop_sd/sqrt(sample_size)) #equivalent, written inline
    z_test2
if(z_test > a){
  print("We reject the null hypothesis (Ho).")
} else {
    print("We fail to reject the null hypothesis (Ho)")
} 
    #We reject the null hypothesis only if the
    #test statistic is greater than 1.645.
    #With the hypothetical n above, z = 0.2, so we fail to reject H0.
    #rm(list = ls())  # Clean up
   

    
#########Reporting P-Values as an Alternative to Looking at Critical Regions############
# We can also evaluate a hypothesis by asking,
# “What is the probability of obtaining the value of
# the test statistic we did if the null hypothesis is true?”
# This is called the p-value.
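
#A small sketch of turning a z statistic into a p-value with pnorm()
#(hypothetical z = 1.5, not tied to a specific example above):
z <- 1.5
1 - pnorm(z)             #one-tailed (right-tail) p-value
2 * (1 - pnorm(abs(z)))  #two-tailed p-value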
###########Week 2: Videos Comprehension##################
#Hypothesis Testing
#1. In statistical inference, measurements are made on a ________, and generalizations are made to a _________.
#Sample, population
#2. Which hypothesis depicts the research hypothesis--the statement that we hope to demonstrate is true?
#alternative hypothesis

#3a. Truth: The drug is ineffective. The test on your sample leads you to conclude that it is effective.
#Type 1 error
#3b. Truth: The drug reduces allergies. The test on your sample leads you to conclude that it is effective.
#correct conclusion
#3c. Truth: the drug is ineffective. The test on your sample leads you to conclude that it is ineffective.
#Correct conclusion

#3d. Truth: the drug does reduce allergies. The test on your sample leads you to conclude that it does not reduce allergies.
#Type 2 error

#Alpha Levels, Critical Values, and P-Values

#Comprehension Check

#Single Sample z-test
# Compare a sample mean to a hypothesized (null) value
#z = (xbar - mu) / (sigma / sqrt(n))

# 1. A medical researcher compares a new treatment for poison
# ivy against a traditional ointment. He finds that the new
# treatment significantly reduces itching, with a p-value of 0.047.
# 
#Answer: It is the likelihood of observing this reduction in itching if there really
#is no difference between the two treatments

# 2. Aviation experts fear that pilots are being asked to fly
# longer than is recommended by national guidelines.
# Current FAA regulations for domestic flights generally
# limit pilots to eight hours of flight time during a 24-hour
# period. FAA administrators conduct an analysis using a large
# sample of flight records for domestic flights in the past year.
#Ho: μ ≤ 8 hours, and Ha: μ > 8 hours
#This is a right-tailed test, since the rejection region
#would consist of values greater than 8.

#2b. If the FAA administrators want to be 95% confident
#in the result of their hypothesis test, what value of α
#should they set?
#α = 0.05

#2c. Identify the critical z-value(s) for this problem.
#A 95% confidence level means that a total of 5% of the
#area under the curve is considered the critical region.
#browseURL("http://www.stat.ufl.edu/~athienit/Tables/Ztable.pdf")
# If α = 0.05, then the rejection region occupies 5% of the area
# under the curve, and the other 95% lies below the critical value.
# Using the Z-score reference table above, we find that the
# Z-score associated with a cumulative area of 0.9500 is
# approximately 1.645.
#z=1.645

#2d. The average flight time for the pilots is found to be
#8.12 hours. The standard deviation reported by the FAA
#is 0.72 hours, and there were 81 pilots in the sample.
#What is the z-statistic for this hypothesis test?
#z = (x - μ) / SEmean
  pop_mean<-8
  pop_sd<-.72
  sample_size<-81
  sample_mean<- 8.12#this is a sample mean, not an individual observation
    
  sem<-pop_sd/sqrt(sample_size) #standard error of the mean = sd/sqrt(sample size)
  ztest<-(sample_mean-pop_mean)/sem
  ztest
  #Answer: z = 1.50
  
  #supplementary: 95% confidence interval around the sample mean
  upperBound<-sample_mean+1.96*(pop_sd/sqrt(sample_size)) #to the right of the mean
  lowerBound<-sample_mean-1.96*(pop_sd/sqrt(sample_size)) #to the left of the mean
  round(c(lowerBound, upperBound),2)
  
  
#2e. Is the z-statistic in the critical region?
#No
  
#2f. What should be the conclusion of this test?
  #Fail to reject the null hypothesis. 
  
  
# The p-value is the likelihood of observing that particular sample
# value if the null hypothesis were true. Therefore, if the p-value
# is smaller than your significance level, you can reject the null
# hypothesis.

#Student's t-distribution
  #utilizes "degrees of freedom"
  #1. Better estimate of the standard deviation:
    #s = sqrt( sum((x - xbar)^2) / (n - 1) )
  #2. Use the t-distribution
  #As sample size increases, "t" begins to look like "z"

  #t-test formula definitions:  
          # t is the test statistic and has n − 1 degrees of freedom.
          # x̄ is the sample mean
          # μ0 is the population mean under the null hypothesis.
          # s is the sample standard deviation
          # n is the sample size
          # s/√n is the estimated standard error

#To utilize the t-distribution
  #1st step: determine degrees of freedom--the number of observations
  #that are free to vary
  #Example:
  #10 observations
  #9 degrees of freedom

#As our n increases, the t-distribution becomes more normal
  
#How do we know when to use it?
  #ONE RULE: DO WE KNOW SIGMA, THE POPULATION STANDARD DEVIATION?
  #if we don't know sigma, we must use the t-distribution.
  #if we do know sigma, we get to use the normal distribution
  #(see the qt/qnorm comparison below)
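
#A quick check of how t approaches z as the degrees of freedom grow:
qt(0.975, df = c(5, 10, 30, 100, 1000)) #t critical values shrink...
qnorm(0.975)                            #...toward the z value, 1.96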
#   Assumptions of the single sample t-test:
# 
# • A random sample is used.
# • The random sample is made up of independent observations
# • The population distribution must be nearly normal, or
#   the size of the sample is large.
#Comprehension Check: The T Distribution 
  #1. Here is the formula for calculating the t-statistic for a single sample:
  #t = (sample mean - hypothesized mean) / standard error
  #or t <- (sample_mean - hypo_mean)/sem
#1a. Which part of the formula tells you how far your sample
  #mean is from the center of the sampling distribution?
  #Numerator
  
#1b. Which part of the formula tells you how far
  #a sample mean is expected to be from the center of the distribution?
  #Denominator
  
#2. Which of the following is true of the t-distribution?
  #Increasingly resembles the normal distribution as degrees of freedom increase.
  #Assumes the population is normally distributed
  #It has a greater spread than the normal distribution.
  #All of the above
  
#Single Sample T-Test
  
# Example  
# The high school athletic director is asked if football players
# are doing as well academically as the other student athletes.
# We know from a previous study that the average GPA for the student
# athletes is 3.10. After an initiative to help improve the GPA of
# student athletes, the athletic director randomly samples 20
# football players and finds that the average GPA of the sample
# is 3.18 with a sample standard deviation of 0.54. Is there a
# significant improvement? Use a 0.05 significance level.

# 
#    Hypothesis Step 1: Clearly state the null and alternative hypotheses.
#     H0: μ = 3.10
#     Ha: μ ≠ 3.10
        pop_mean<-3.10
        sample_size<-20
        sample_mean<- 3.18 #this is a sample mean, not an individual observation
        sample_sd<-.54 #We don't know the population standard deviation.
        a<-.05
#     
#    Hypothesis Step 2: Identify the appropriate significance level and confirm

#   the test assumptions.
#    We need a table of tabulated t-values for significance level
    #and degrees of freedom
    #browseURL("https://www.stat.wisc.edu/courses/st371-hanlon/hand/table/tTable.pdf") 
#    We use our t-test formula:
#     We know that we have 20 observations, so our degrees of 
#     freedom for this test is 19. Nineteen degrees of
#     freedom at the 0.05 significance level gives us a critical
#     value of ± 2.093. (.025 on the upper and .025 on the lower)
      degreesFreedom<-sample_size-1    
      critical_value<-2.093
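      #The same critical value can be pulled from R instead of the
      #printed table (a check, not part of the original solution):
      qt(0.975, df = degreesFreedom) #two-tailed, alpha = .05 -> 2.093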
# 
#     Hypothesis Step 3: Analyze the data: 
      se_sample<-sample_sd/sqrt(sample_size)
      #t-test
      t_test<-(sample_mean-pop_mean)/(sample_sd/sqrt(sample_size))
      #or in the alternative
      t_test2<-(sample_mean-pop_mean)/se_sample

      t_test
      
# Hypothesis Step 4: Interpret your results
  if(abs(t_test) > critical_value){
  print("We reject the null hypothesis.")
} else {
    print("We fail to reject the null hypothesis (Ho)")
} 
    #For this two-tailed test we reject H0 if |t| > 2.093.
    #Here t ≈ 0.66, so we fail to reject H0.
    #rm(list = ls())  # Clean up
 # Since our calculated t-test value is lower than our t-critical value,
 # we fail to reject the Null Hypothesis. Therefore, the average GPA of
 # the sample of football players is not significantly different from
 # the average GPA of student athletes. Therefore, we can conclude that
 # the difference between the sample mean and the hypothesized value is
 # not sufficient to attribute it to anything other than sampling error.
 # Thus, the athletic director can conclude that the mean academic
 # performance of football players does not differ from the mean
 # performance of other student athletes.
#1. A necessary condition for a one-sample t-test is that
#the sample must consist of "independent" observations.
#What does this mean?
#Answer: There must be no predictable relationship between one
#subject's score and any other subject's score
  
#2. Researchers are interested in whether or not the average person consumes
#2,000 calories per day. Their random sample of 25 people consumed an average
#of 1,891 calories, with a standard deviation of 251 calories.  
      
# Hypothesis Step 1: Clearly state the null and alternative hypotheses.
#     H0: μ = 2000
#     Ha: μ ≠ 2000
        pop_mean<-2000
        sample_size<-25
        sample_mean<-1891  #this is a sample mean, not an individual observation
        sample_sd<-251 #We don't know the population standard deviation.
#     
#   Hypothesis Step 2: Identify the appropriate significance level and confirm
#   the test assumptions.
#   We need a table of tabulated t-values for significance level
    #and degrees of freedom
    #browseURL("https://www.stat.wisc.edu/courses/st371-hanlon/hand/table/tTable.pdf") 
#     We use our t-test formula:
#     We know that we have 25 observations, so our degrees of 
#     freedom for this test is 24. Twenty-four degrees of
#     freedom at the 0.05 significance level (two-tailed) gives us a
#     critical value of ± 2.064 (.025 in the upper and .025 in the lower tail)
      degreesFreedom<-sample_size-1    
      critical_value1<-2.064
      critical_value2<-(-2.064)
# 
#     Hypothesis Step 3: Analyze the data: 
      se_sample<-sample_sd/sqrt(sample_size)
      #t-test
      t_test<-(sample_mean-pop_mean)/(sample_sd/sqrt(sample_size))
      t_test
      #or in the alternative
      t_test2<-(sample_mean-pop_mean)/se_sample
      t_test2
      
  t_test > critical_value1 #upper tail
  t_test < critical_value2 #lower tail
  
# Hypothesis Step 4: Interpret your results
  if(abs(t_test) > critical_value1){
  print("We reject the null hypothesis.")
} else {
    print("We fail to reject the null hypothesis (Ho)")
} 
    #We reject the null hypothesis if |t| is greater than 2.064.
    #Here t = -2.17, so |t| > 2.064 and we reject H0.
#2a. What is the t-statistic? (Round to 2 decimal places.)
  round(t_test,2)
#2b. What is the absolute critical t value, assuming α=0.05?
  critical_value1
  
#2c. What should the researchers conclude?
#People do not consume an average of 2,000 calories per day. correct
#rm(list = ls())  # Clean up  
  
#3. Scientists fear that polar bears are slowly
#starving due to their shrinking habitat. A
#healthy male polar bear weighs about 900 pounds.
#A new expedition was able to estimate the weight
#of 7 male polar bears. They found an average weight
#of 861 lbs with a standard deviation of 59 pounds.
  pop_mean<-900
  sample_size<-7
  sample_mean<-861  #this is a sample mean, not an individual observation
  sample_sd<-59 #We don't know the population standard deviation.
    

#3a. What is the alternative hypothesis for this test?
  #left-tail
  #H0: mu>= 900
  #Ha: mu<900

#3b. How many degrees of freedom are there?
degreesFreedom<-print(sample_size-1)
  
#3c. What is the value of the standard error? (Round to 1 decimal place.)
standard_error<-print(round(sample_sd/sqrt(sample_size),1))

#3d. What is the t-statistic? (Round to 3 decimal places.)
t_test<-(sample_mean-pop_mean)/(sample_sd/sqrt(sample_size))
round(t_test,3)

#3e. What is the t-critical value, assuming α=0.05?
#6 degrees of freedom
#lower-tail probability .05
 critical_value<-print(-1.9432)
 t_test < critical_value #in the rejection region if TRUE

#For this left-tailed test, we reject the null hypothesis if the
#test statistic is less than -1.943.
#It is not; t = -1.749.
#We fail to reject H0.

#3f. Do you reject the null hypothesis?
 #No.
 
#3g. What should the researchers conclude?
#There is no evidence that polar bears weigh less than 900 lbs on average.
 
#rm(list = ls())  # Clean up  

#############Week 2: Hypothesis Testing (one group means) R Tutorial Videos##########

#import dataset
bull<-fread("https://d37djvu3ytnwxt.cloudfront.net/assets/courseware/v1/f544a6f73ec30abf4c6675772ae5f8c7/asset-v1:UTAustinX+UT.7.11x+2T2017+type@asset+block/BullRiders.csv")

#1a. How many observations are in the dataset?
glimpse(bull)
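#Answer: 58 observations (glimpse(bull) lists 58 observations of 44 variables)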

#So suppose you want to test a claim that the mean age of bull riders
#in this season was 30 years old.

# H0: μ = 30 
# Ha: μ ≠ 30
#A two-tail test
age<-2012 -bull$YearBorn

hist(age)

fivenum(age)

#t test in R. Give it the vector of values.
#Can also give it the value in our null hypothesis. The pop mean.
t.test(age, mu=30)
#we get our t-statistic,
#degrees of freedom (n-1),
#and a p-value very close to 0
#We would reject the null hypothesis that the population mean = 30
#It also gives us a 95% confidence interval
#and the alternative hypothesis; the two-sided
#hypothesis is the default.
#if we want a one-sided test, add the alternative option
#If we think the mean age of bull riders is less than 30
t.test(age, mu=30, alternative = "less")

#############Week 2: Pre-lab##########

#Primary Research Question

#import dataset
bull<-fread("https://d37djvu3ytnwxt.cloudfront.net/assets/courseware/v1/f544a6f73ec30abf4c6675772ae5f8c7/asset-v1:UTAustinX+UT.7.11x+2T2017+type@asset+block/BullRiders.csv")

#The average American adult man weighs
#190 pounds.  Do professional bull riders from the
#US weigh the same?

# H0: μ = 190
# Ha: μ ≠ 190
#A two-tail test

#1a. Which variable tells us how much the rider weighs? The variable name in the dataset is:
head(bull$Weight)

#1b. What type of variable is this?
#Quantitative

# Reflect on the Method
#Which method should we be using for this analysis
#and why?

# 2a. We will use a one-sample t-test to help
#us answer this lab question. Why?
#ANSWER: We want to compare the average weight of these bull riders to a claimed value.

#2b. We want to test a hypothesis that professional
#bull riders weigh 190 pounds on average. What will
#the null hypothesis look like for this one-sample
#t-test?
#ANSWER: μ=190 pounds

#2c. The formula to calculate a t-statistic is below.
#What does the denominator of this test tell us?
#t = (x̄ - μ)/SE
#ANSWER: the difference you would expect based on chance alone

#Example t.test
hist(bull$Weight)
t.test(bull$Weight, mu = 190)
# H0: μ = 190
# Ha: μ ≠ 190
#We reject the null hypothesis.
#Breakdown Your Analysis
#Here is the code you will use:

#Select bull riders from the US
USA <-bull[bull$Country=="USA",]

# Summarize the bull rider weights
mean(USA$Weight)
sd(USA$Weight)

# Visualize the weight distribution
hist(USA$Weight, main='Histogram of US Bull Rider Weights',xlab='Weight (lbs)')

# Run the single sample t-test
t.test(USA$Weight, mu=190)

#1. What type of graph are we going to use to visualize the weights of the bull-riders?
#Histogram

#2. What portion of the code defines the value of the null hypothesis?
#mu=190
#3. Which assumption can we confirm with the use of the following line of code:
hist(USA$Weight, main='Histogram of US Bull Rider Weights', xlab='Weight (lbs)')
#Normality

#4. If you wanted to calculate the standard error for this sample of 37 riders, what additional line of code would you need to add?
sd(USA$Weight)/sqrt(37)

#5. What is the cause of the error in the code below?

#bull <- BullRiders
#hist(bull$YearBorn, main = 'Histogram of Bull Rider Weights, xlab= 'Weight (lbs)')

#Error: unexpected symbol in "hist(bull$YearBorn, main = 'Histogram of Bull Rider Weights, xlab= 'Weight"
#ANSWER: the main title string is missing its closing quote, so the
#quote before 'Weight (lbs)' ends the string early and R hits an
#unexpected symbol.

#Conduct the Analysis in R

#1. Did the histogram of the bull-riders' weights show any significant skew that
#would violate the assumption of Normality?
#No

#Report the sample statistics for the bull-rider weights. (Round to 2 decimal places.)

#2a. Sample mean (in pounds)=
round(mean(USA$Weight),2)

#2b. Sample standard deviation (in pounds)=
round(sd(USA$Weight),2)

#One-sample t test results:
#3a. t-statistic (rounded to 1 decimal place) = -17.2
t.test(USA$Weight, mu=190)
#3b. degrees of freedom for the test =36

#95% confidence interval:

#3c. Lower bound estimate, in pounds (rounded to 1 decimal place) =148.8
round(148.7660,1)
#3d. Upper bound estimate, in pounds (rounded to 1 decimal place) = 157.5
round(157.4502,1)

#4. The p-value of the test was very small (< 0.05). How should we interpret this p-value?
#If bull-riders really do weigh 190 pounds on average, observing this sample mean is very unlikely.

#We should reject the hypothesis that the mean weight of bull riders
#is equal to 190 lbs. It appears that the bull-riders actually weigh
#less than the average American man.

#Primary Research Question

#The average American adult male weighs 190 pounds.  Do professional
#bull-riders in the US weigh the same?

#Write Your Conclusion

#Answer the question and support your answer with statistics:

# The distribution of weight for this sample of professional
# bull riders is approximately normal with a mean of 153.11 lbs and a
# standard deviation of 13.02 lbs. We found that their mean weight
# is significantly different than 190 lbs, with t = -17.2,
# p less than 0.05. We are 95% confident that the true mean
# of professional bull riders is between 148.8 lbs and 157.5 lbs,
# suggesting that professional bull riders weigh less than
# the average adult male.


#############Week 2: Lab##########
#import dataset
bull<-fread("https://d37djvu3ytnwxt.cloudfront.net/assets/courseware/v1/f544a6f73ec30abf4c6675772ae5f8c7/asset-v1:UTAustinX+UT.7.11x+2T2017+type@asset+block/BullRiders.csv")

# Over 1,200 bull riders from around the world are members of
# Professional Bull Riders (PBR) and compete in the more than
# 300 PBR affiliated bull riding events per year. This data
# set includes information about the top 50 ranked bull riders 
# for the 2013 season, according to the PBR standings reported in July
# of 2013. Rankings are based on a system which awards points
# for qualified rides at events throughout the season.  
#Review of the One-Sample t-Test

#In this lab, you will use a one-sample t-test to answer
#a question of interest. Let's start by remembering why we
#use hypothesis tests.
#1a. What is the goal of a hypothesis test?
#ANSWER: To determine if the sample data is consistent, or
#inconsistent, with the null hypothesis about the population.

#1b. For your test result to be considered trustworthy,
#your data must meet the assumptions for a one-sample t-test.
#Which of the following is not an assumption of this test?
#Answer: NOT AN ASSUMPTION--The data was collected voluntarily from all subjects. 

#THE FOLLOWING ARE THE CRITERIA:

# 1.The sample is made up of independent observations.
# 2.The population distribution should be nearly Normal, or the sample should be large.
# 3.A random sample is used.

#Lab Preparation

#2. One of the following questions will be answered in this
#lab using a one-sample t-test. Select the question that
#can be answered with this method.
#Is the average ride percentage of professional bull riders at least 50%?

#Analyze the Data
#Primary Research Question
#Do professional bull riders stay on their bulls 50% of the
#time? Test the hypothesis that the mean ride percentage
#is 0.500 in 2014, using riders with at least 5 events in 2014. 
# Analysis
# 
# Let’s break this question down into the different
# descriptive statistics that you will need to construct
# your answer. Be sure that your R output includes all
# of the following components.
# 
# 1. Select the riders that participated in at least
#5 events in 2014.

events2014<-sqldf('SELECT *
                FROM bull
                WHERE Events14 >= 5')
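
#The same subset can be taken in base R without sqldf (an equivalent
#alternative, not part of the original lab code):
#events2014 <- bull[bull$Events14 >= 5, ]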
# 2. Calculate the sample mean and standard deviation
#of ride percentage in 2014.
mean(events2014$RidePer14)
sd(events2014$RidePer14)
# 3. Generate a histogram to look at the distribution
#of the ride percentage in 2014.
hist(events2014$RidePer14)

# 4. Confirm the assumptions of a one-sample t-test.
#One-sample t test results (two-sided, the t.test default):
# H0: μ = .50
# Ha: μ ≠ .50

# 5. Run the t-test and interpret the results.
t.test(events2014$RidePer14, mu=.50)

#Descriptive Statistics

#1a. What was the average ride percentage in 2014
#for this sample? (Round to 3 decimal places.)
round(mean(events2014$RidePer14),3)

#1b. What was the standard deviation of ride percentage in 2014
#for this sample? (Round to 3 decimal places.)
round(sd(events2014$RidePer14),3)

#Test Statistics

#2a. What is the value of the t-statistic? (Round absolute value to 2 decimal places.)
t.test(events2014$RidePer14, mu=.50)
round(-10.054,2)

#2b. How many degrees of freedom?
41

#2c. The p-value was _______ 0.001.
#less than
#3. What decision should you reach, based on these test results?
#reject the null hypothesis

#What is the appropriate conclusion for this test?
#Professional bull riders do not stay on their bulls 50% of the time

#Draw Conclusions

# The distribution of the percentage of time a professional bull
# rider stays on the bull for this sample is approximately normal,
# with a mean of 33.5% and a standard deviation
# of 10.7%. We found that their mean ride percentage is
# significantly different from 50%, with t = -10.05,
# p < 0.05. We are 95% confident that the true mean ride
# percentage of professional bull riders is between
# 30.1% and 36.8%, suggesting that professional bull riders
# ride the full 8 seconds about 1/3 of the time.

#############Week 2: Problem Set##########
#Question 1

# How much money do professional bull riders earn by participating
# in an event?

#1. Create a new variable that equals the "average earnings
#per event" in the 2012 season for each bull rider in the
#dataset. Call this new variable "earnings_per"

bull$earnings_per<-bull$Earnings12/bull$Rides12
#2. Make a histogram of your "earnings per event" variable.
hist(bull$earnings_per)

#3. Use this data to answer the following questions.

#1a. Have we met the assumptions for being able to calculate
#a 95% confidence interval to estimate the true mean
#earnings-per-event for a professional bull rider (using t)?
#Use the histogram to help answer this question.
#ANSWER: No, the distribution of "earnings_per" is positively skewed, with an outlier
# When a variable is highly skewed, we can transform the data into a shape that allows us to conduct our analysis. 
# 
# 1. Create a new variable that is the log of your "earnings_per" variable.  
# 
# 2. Here is the code to make a log transformation of a variable: 
# 
  #bull$newvariable <- log(bull$originalvariable)
  bull$earnings_per_log <- log(bull$earnings_per)
  hist(bull$earnings_per_log) #check results
  
# 3. Now use this new variable to answer the following questions. 

  # 1b. Make a histogram of this log-transformed variable.
  # Notice how the distribution shape has changed.
  # Can we reliably calculate a 95% confidence interval
  # for the mean of this transformed variable?
  # ANSWER: Yes, the distribution of the log-transformed
  # variable looks relatively normal (some slight positive skew).
  
#1c. What is the mean of the log-transformed earnings-per-event
#variable? (Round to 2 decimal places.)
  round(mean(bull$earnings_per_log, na.rm=TRUE),2)
  #Answer: 8.77 (the value 8.85 is nevertheless used as the
  #hypothesized mean in the t-test below)
  


#1d. What are the lower and upper-bounds for a 95% confidence interval
  #around this transformed mean? (Round each to 2 decimal places.)
  transformed_mean<-8.85
  t.test(bull$earnings_per_log, mu=8.85)
#Lower-bound
  round(8.572169,2)
  
#Upper-bound (Round to 2 decimal places.)
  round(8.968618,2)
  
# To best interpret the 95% confidence interval, 
# we need to transform the lower and upper bound
# estimates back into dollars/events.  
# 
# 1. Here is the code you will use to take a log-transformed
# value back to the original units:

lowerboundvalue<-8.572169
exp(lowerboundvalue)
round(5282.575,0) #about $5,283 per event

upperboundvalue<-8.968618
exp(upperboundvalue)
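
#Alternatively, both bounds can be back-transformed in one step by
#pulling the interval from t.test()'s conf.int component (a sketch,
#equivalent to the by-hand exp() calls above):
exp(t.test(bull$earnings_per_log, mu = 8.85)$conf.int)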

# 2. Run this code on the unrounded original values.
# Then answer the questions that follow.

#1e. What are the lower and upper-bounds for a 95%
#confidence interval in dollars/event units?
#(Round each to whole numbers with no decimal places.)
round(exp(lowerboundvalue),0)

#Upper-bound (Round to whole number with no decimal places.)
round(exp(upperboundvalue),0)

#Question 2
#Students collected 8 random bags of a specific brand of
#potato chips and carefully weighed the contents of each bag,
#recording the following weights (in grams): 

chips<-c(29.4, 29.0, 28.4, 28.8, 28.9, 29.3, 28.5, 28.2)
#The students want to test the claim that the mean
#weight of these bags is 28.5 grams.  They think it may be
#different. 

#2a. What is the appropriate null hypothesis for this test?
#This is a two-tail test
            # H0: μ = 28.5 
            # Ha: μ ≠ 28.5
#2b. What are the sample mean and standard
#deviation? (Round each to 2 decimal places.)
round(mean(chips),2)
round(sd(chips),2)

#2c. What is the test statistic for this hypothesis test?
#This is the t-statistic for the sample mean.
#(Round to 2 decimal places.)
#NOTE: Be sure to use the proper formula and
#your rounded answers to the previous questions to
#determine the statistic. If you use R or do not use
#the rounded values, your answer may be marked incorrect.
pop_mean<-28.5
sample_sd<-.43 #rounded sample standard deviation from 2b
sample_mean<-28.81 #this is a sample mean, not an individual observation
sample_size<-8
t_test<-(sample_mean-pop_mean)/(sample_sd/sqrt(sample_size))
round(t_test,2)

t.test(chips, mu=28.5) #using base R

#2d. What is t-critical for this test, assuming an alpha
#level of 0.05? (Round to 3 decimal places.)
##The critical value is either a t-score or a z-score.
# We know that we have 8 observations, so our degrees of 
#     freedom for this test is 7. 7 degrees of
#     freedom at the 0.05 significance level gives us a critical
#     value of ± 2.3646. (.025 on the upper and .025 on the lower)
#browseURL("https://www.stat.wisc.edu/courses/st371-hanlon/hand/table/tTable.pdf")
      degreesFreedom<-sample_size-1    
      critical_value<-2.3646
      round(critical_value,3)

#2e. What was the outcome of your test?
#Fail to reject the null hypothesis 

#2f. In addition to random selection, what other condition of
      #the data must be true for our t-test outcome to be reliable?
#ANSWER: The bags of potato chips must have an approximately Normal population distribution for weight.      
      
#2g. Does your data provide sufficient evidence to suggest that
      #the mean weight of these bags of potato chips is not 28.5 g per bag?
#ANSWER:No.

      #rm(list = ls())  # Clean up
#Question 3
# An industrial plant dumps its waste into a nearby river, but claims
# that it is not impacting the native species of frogs that live in
# the river.  The frogs are able to tolerate calcium concentrations up
# to 91 mg/L.  
# 
# You measure the concentration of calcium in 25 random samples
# from the river.  Your measurements are approximately normally
# distributed, with a mean of 93.6 mg/L, with a standard deviation
# of 7.8 mg/L.
# 3a. What is the appropriate alternative hypothesis if the
# industrial plant's runoff is believed to be producing higher
# calcium concentrations than are deemed acceptable for the frogs?
# Let μ represent the true calcium concentration in the river
# downstream from the plant.
      #right one-tail test
      # H0: μ ≤ 91
      # Ha: μ > 91
pop_mean<-91
sample_sd<-7.8
sample_mean<-93.6
sample_size<-25
t_test<-(sample_mean-pop_mean)/(sample_sd/sqrt(sample_size))
round(t_test,2)

# The critical value is a t-score here, since we only know the sample sd.
# We know that we have 25 observations, so our degrees of 
#     freedom for this test is 24. Twenty-four degrees of
#     freedom at the 0.05 significance level gives us a critical
#     value of 1.7109 (.05 in the upper tail, since Ha: μ > 91)
#browseURL("https://www.stat.wisc.edu/courses/st371-hanlon/hand/table/tTable.pdf")
      degreesFreedom<-sample_size-1    
      critical_value<- 1.7109
      round(critical_value,3)
      t_test > critical_value #we reject the null hypothesis if TRUE
      #Conclusion: t = 1.67 < 1.711, so we fail to reject H0
      
#3b. Calculate the test statistic. (Round to 2 decimal places.)
      round(t_test,2)

#3c. What is the t-critical value? Assume an alpha level of .05.(Round to 3 decimal places.)
      round(critical_value,3)
      #ANSWER: 1.711
      
#3d. Does your data provide sufficient evidence to suggest that the calcium concentration in the river is more than 91 mg/L?
      #No
      
#3e. Suppose as part of a broader investigation into the plant's impact
      #on the river's ecosystem, an environmental group conducted a
      #large-scale study and found that the actual mean calcium concentration
      #level downstream from the plant is 95 mg/L. Did you make an error
      #in your hypothesis test, and if so, what type was it?
      #ANSWER: Yes, a Type II Error

      #rm(list = ls())  # Clean up

#Question 4

# You are studying a population of peregrine falcons and want
# to estimate their average wingspan.  So you collect a random
# sample of 12 adult male birds and measure a mean wingspan
# of 42.6 cm, with a standard deviation of 5.3 cm. 
# 
# Assume that the distribution of measurements was approximately normal.

sample_sd<-5.3
sample_mean<-42.6
sample_size<-12

# The critical value is a t-score, since we only know the sample sd.
# We know that we have 12 observations, so our degrees of 
#     freedom for this test is 11. Eleven degrees of
#     freedom at the 0.10 significance level gives us a critical
#     value of ± 1.7959 (.05 in the lower and .05 in the upper tail)
#browseURL("https://www.stat.wisc.edu/courses/st371-hanlon/hand/table/tTable.pdf")
      degreesFreedom<-sample_size-1    
      critical_value1<- -1.7959
      critical_value2<-  1.7959
criticalValue<-print(qt(.95, df = 11)) #R check: the positive 90%-CI critical value, 1.796

#4a. What is t-critical for a 90% confidence interval?
      #(Report as a positive value rounded to 3 decimal places.)
      round(1.7959,3)
## [1] 48.1702
#4b. Calculate a 90% confidence interval for the mean wingspan
#for the population of male peregrine falcons. (Round to 2 decimal places.)
      #90% confidence interval using the critical value 1.7959
upperBound<-sample_mean+1.7959*(sample_sd/sqrt(sample_size)) #to the right of the mean
lowerBound<-sample_mean-1.7959*(sample_sd/sqrt(sample_size)) #to the left of the mean
round(c(lowerBound, upperBound),2)
     
# Lower-bound
   39.85   

#Upper-bound(Round to 2 decimal places.)
   45.35
      
#4c. If you calculated a 95% confidence interval for the population mean
      #from the same data, would your confidence interval be narrower
      #or wider than your interval above?
upperBound<-sample_mean+2.2010*(sample_sd/sqrt(sample_size)) #to the right of the mean
lowerBound<-sample_mean-2.2010*(sample_sd/sqrt(sample_size)) #to the left of the mean
round(c(lowerBound, upperBound),2)
#Wider

#browseURL("https://onlinecourses.science.psu.edu/statprogram/node/137") #Stats Cour

##################Inference for a mean example: Statistics with R#######################

# Estimate the average after-lunch snack consumption (in grams) of people
# who eat lunch distracted using a 95% confidence interval

sample_sd<-45.1
sample_mean<- 52.1 #x bar
sample_size<-22
sem<-sample_sd/sqrt(sample_size) #standard error of the mean = sd/sqrt(sample size)
criticalValue<-print(qt(.975, df = 21)) #use the positive critical value, 2.080
right<-sample_mean + criticalValue * sem
left<-sample_mean - criticalValue * sem
print(c(left, right))
#Conclusion: We are 95% confident that distracted eaters consume between
#32.1 and 72.1 grams of snacks post-meal
#rm(list = ls())  # Clean up
#Generic recipe: t confidence interval from mean a, sd s, and n observations
a<-5
s<-2
n<-20
error <- qt(0.975,df=n-1)*s/sqrt(n)
left <- a - error
right<- a + error
print(c(left, right))
#Same calculation for the distracted-eating example:
a<-52.1 #sample mean
s<-45.1 #sample standard deviation
n<-22 #sample size
error <- qt(0.975,df=n-1)*s/sqrt(n)
left <- a- error
right<- a + error
print(c(left, right))

########################Week 3: Hypothesis Testing (Two Group Means)  ################

#################Independent Samples t-test##########################


#Independent Samples

#When we work with two independent samples, our null hypothesis
#assumes that if the samples are selected at random, the samples
#will vary only by chance and the difference will not be
#statistically significant. In short, when we have independent
#samples we assume that the scores of one sample do not affect
#the other.

#Let's see how the new hypothesis statements look in our
#t-statistic formula: t = ((x̄1 − x̄2) − (μ1 − μ2)) / SE(x̄1 − x̄2)

# Where:
# x̄1 − x̄2 is the difference between the sample means
# μ1 − μ2 is the difference between the hypothesized population means
# SE(x ̄1−x ̄2) is the standard error of the difference between the sample means

# When we solve the unpooled independent samples t-test by hand,
# the conservative approach is to use the lowest n of the two groups
# minus one:
# df = n_smallest - 1
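
#As a quick sketch, the t-statistic formula and the conservative df rule
#can be coded directly from summary statistics (the function name
#independent_t is our own, for illustration):
independent_t <- function(xbar1, s1, n1, xbar2, s2, n2, mu_diff = 0){
  se <- sqrt(s1^2/n1 + s2^2/n2)        #SE of the difference between means
  t <- ((xbar1 - xbar2) - mu_diff)/se  #t-statistic
  df <- min(n1, n2) - 1                #conservative degrees of freedom
  c(t = t, df = df, se = se)
}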

#Step 1: Hypothesis Statements
# H0 :μ1 = μ2 or H0 :μ1 - μ2 = 0
# Ha :μ1 ≠ μ2 or Ha: μ1 - μ2 ≠ 0

#Hypothesis Step 2: Identify the appropriate significance level and confirm the test assumptions.

#Hypothesis Step 3: Analyze the data and generate the test statistic.

# Hypothesis Step 4: Interpret your results.


################Example: Independent t-test##################

# The head of the English department is interested in the difference in
# writing scores between freshman English students who are taught by
# different teachers. The incoming freshmen are randomly assigned to
# one of two English teachers and are given a standardized writing test
# after the first semester. We take a sample of eight students from one
# class and nine from the other. Is there a difference in achievement
# on the writing test between the two classes?

sample1<-c(35,51,66,42,37,46,60,55,53)
sample1_mean<-print(mean(sample1))
sample1_sd<-print(sd(sample1))
n1<-print(length(sample1))
pop_mean1<-0

sample2<-c(52,87,76,62,81,71,55,67)
sample2_mean<-print(mean(sample2))
sample2_sd<-print(sd(sample2))
n2<-print(length(sample2))
pop_mean2<-0

#Step 1: Hypothesis Statements
#We will be testing to see if the mean score of the two classes are equal to one another:
# H0 :μclass1 = μclass2 or H0 :μclass1 - μclass2 = 0
# Ha :μclass1 ≠ μclass2 or Ha: μclass1 - μclass2 ≠ 0
#Hypothesis Step 2: Identify the appropriate significance level and confirm the test assumptions.
# We'll use the standard significance level of 0.05. We were told
# that students were randomly assigned, we'll assume that students
# did not switch classes (for independence), and we'll assume the
# students' scores are independent from one another. We'll assume the
# underlying population of students in each class is nearly normal in
# the distribution of scores.

#Hypothesis Step 3: Analyze the data and generate the test statistic.
#Now we’ll use our t-test to get to the analysis.

#First, our standard error of the difference between the sample means:
#EQUATION: SE(x̄1 − x̄2) = sqrt(s1^2/n1 + s2^2/n2)
sem<-sqrt(((sample1_sd*sample1_sd)/n1) + ((sample2_sd*sample2_sd)/n2))

#Second, we will use that SE in the t-test equation:
t_test<-print(((sample1_mean-sample2_mean) - (pop_mean1-pop_mean2))/sem)
round(t_test,3)

#Third, degrees of freedom
degreesFreedom<-print(min(n1,n2)-1)

#Fourth, the critical value
#We know that our smallest group has just eight students, so
#     the degrees of freedom for this test is 7. Seven degrees of
#     freedom at the 0.05 significance level gives us a critical
#     value of ±2.365 (.025 in the lower tail and .025 in the upper).
#browseURL("https://www.stat.wisc.edu/courses/st371-hanlon/hand/table/tTable.pdf")
criticalValue1<-2.365
criticalValue2<--2.365

t_test>criticalValue1 #FALSE: t is not in the upper rejection region

t_test<criticalValue2 #TRUE: t falls in the lower rejection region, so we reject H0

#Hypothesis Step 4: Interpret your results.
#H0 :μclass1 - μclass2 = 0

#Ha: μclass1 - μclass2 ≠ 0
t_test
criticalValue1
criticalValue2
#Values supporting the null hypothesis occur between the two critical
#values; values supporting the alternative hypothesis occur within the
#critical regions (the tails).
#The critical values of the test are -2.365 and 2.365,
#as these are the dividers between values supporting the
#alternative and null hypothesis. The critical regions can also be
#seen as rejection regions, since an observed value in either
#region indicates that the null hypothesis should be rejected.
#t_test value = -3.49639
#Because our calculated t-value is outside the t-critical value
#(our value falls in the critical region of the t-distribution),
#we reject our Null hypothesis. We conclude that the populations of
#students in the two classes significantly differ in their standardized
#test scores at the end of the semester. As the two classes were randomly
#assigned, we can plausibly conclude that the difference in the scores
#was due to the class assignment: which class the students were in
#(and whatever teaching technique was used).
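
#With the raw scores available, the same comparison can also be run
#directly with base R's t.test(), which defaults to the Welch test and
#therefore uses a less conservative df than the hand calculation above:
t.test(sample1, sample2)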

#rm(list = ls())  # Clean up

#############Example: Dependent sample t-test###########################
# A math teacher wants to determine the effectiveness of her statistics
# lesson. She gives a simple skills test to nine students before the
# start of class (a pre-test) and the same skills test to the same
# students at the end of class (a post-test).

pre_test<-c(78,67,56,78,96,82,84,90,87)
post_test<-c(80,69,70,79,96,84,88,92,92)
test_scores<-data.frame(cbind(pre_test, post_test))
test_scores$diff_test_scores<-test_scores$pre_test - test_scores$post_test
n<-print(length(test_scores$diff_test_scores))

stand_dev_diff<-sd(test_scores$diff_test_scores)
mean_of_difference<-abs(mean(test_scores$diff_test_scores))

#Hypothesis Step 1: Clearly state the Null and Alternative Hypothesis.

# H0 :δ = 0
# HA :δ ≠ 0

#Hypothesis Step 2: Identify the appropriate significance level
#and confirm the test assumptions.
# assume the standard significance level of 0.05
# students are random and independent
# distribution of all possible difference scores in the population are nearly normally distributed.
hist(test_scores$diff_test_scores)


#Hypothesis Step 3: Analyze the data.
## [1] "Freshman"  "Sophomore" "Freshman"  "Freshman"  "Sophomore" "Junior"
#3A: Calculate the Standard Error
standard_error<-print(stand_dev_diff/sqrt(n))

#3B: Calculate degrees of freedom
degreesFreedom<-print(n-1)

#3C: Calculate the t-statistic
t_test<-print((mean_of_difference-0)/standard_error)

#3D:Calculate Critical Value
critical_value<-qt(.025, df = degreesFreedom) #qt(.025, df) returns the negative lower-tail value
abs(critical_value) #take the absolute value for the positive critical value

#Hypothesis Step 4: Interpret your results.
t_test>abs(critical_value) #if true, reject null hypothesis

#As our calculated t-statistic is greater than our
#t-critical value (our value lies in the critical region),
#we reject our Null hypothesis and conclude that there was in fact
#a change in student performance from the Pre-test to the Post-test.

# Run dependent t-test using base R
t.test(test_scores$pre_test, test_scores$post_test, paired=T)
p_value<-2*pt(-abs(t_test),df=degreesFreedom)
p_value <= .05 #reject null hypothesis if true
p_value>.05 #fail to reject null hypothesis if true
#Reject the null hypothesis

#rm(list = ls())  # Clean up

##############Example: Dependent Sample t-test with confidence interval#########
# Example:
# 
# In the Framingham Offspring Study, participants attend clinical
# examinations approximately every four years. Suppose we want to
# compare systolic blood pressures between examinations (i.e., changes
# over 4 years). The data below are systolic blood pressures measured
# at the sixth and seventh examinations in a subsample of n=15 randomly
# selected participants. Since the data in the two samples (examination
# 6 and 7) are matched, we compute difference scores by subtracting
# the blood pressure measured at examination 7 from that measured at
# examination 6 or vice versa. [If we subtract the blood pressure
# measured at examination 6 from that measured at examination 7,
# then positive differences represent increases over time and negative
# differences represent decreases over time.]

exam6<-c(168, 111, 139, 127, 155, 115, 125, 123, 130, 137,
  130, 129, 112, 141, 122)
exam7<-c(141, 119, 122, 127, 125, 123, 113, 105, 131, 142,
  131, 135, 119, 130, 121)

bp<-data.frame(cbind(exam6, exam7))
bp$difference<-bp$exam6 - bp$exam7
mean_difference<-print(abs(mean(bp$difference))) #Xd
sd_difference<-print(sd(bp$difference))

n<-15

#When samples are matched or paired, difference scores
#are computed for each participant or between members
#of a matched pair; "n" is the number of participants
#or pairs, x bar is the mean of the difference scores, and
#Sd is the standard deviation of the difference scores.
#Hypothesis Step 1: Clearly state the Null and Alternative Hypothesis.
#we want to compare bp over 4 years
# H0 :δ = 0
# HA :δ ≠ 0

#Hypothesis Step 2: Identify the appropriate significance level
#and confirm the test assumptions.

# assume the standard significance level of 0.05
# distribution of all possible difference scores in the
#population are nearly normally distributed.
hist(bp$difference)

#Hypothesis Step 3: Analyze the data.
#In this sample, we have n=15; the mean difference score is -5.3 and its standard deviation is 12.8.

#3A: Calculate the Standard Error
standard_error<-print(sd_difference/sqrt(n))

#3B: Calculate degrees of freedom
degreesFreedom<-print(n-1)

#3C: Calculate the t-statistic #if n>30 use z table
#if n<30, use t-table with df=n-1
t_test<-print((mean_difference-0)/standard_error)
t.test(bp$exam6, bp$exam7, paired=T)
#3D:Calculate Critical Value
critical_value<-qt(.025, df = degreesFreedom) #qt(.025, df) returns the negative lower-tail value
abs(critical_value) #take the absolute value for the positive critical value
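
#A quick illustration of the z-versus-t rule mentioned in step 3C,
#for this sample of n = 15:
qnorm(.975)       #z critical value, about 1.96
qt(.975, df = 14) #t critical value, about 2.14 - wider, as expected for a small n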

#Hypothesis Step 4: Interpret your results.
t_test>abs(critical_value) #if true, reject null hypothesis (here FALSE)

#Calculate Confidence Interval (using the signed differences, exam7 - exam6)
signed_mean_difference<-mean(bp$exam7 - bp$exam6) #-5.33
upperBound<-signed_mean_difference + abs(critical_value) * (sd_difference/sqrt(n)) #to the right of the mean
lowerBound<-signed_mean_difference - abs(critical_value) * (sd_difference/sqrt(n)) #to the left of the mean
print(round(c(lowerBound, upperBound),2)) #approximately -12.4 to 1.8

p_value<-.131 #from the t.test() output above
p_value <= .05 #reject null hypothesis if true
p_value>.05 #fail to reject null hypothesis if true
#Fail to reject H0

#Interpretation:

#We are 95% confident that the mean difference in systolic
#blood pressures between examinations 6 and 7 (approximately
#4 years apart) is between -12.4 and 1.8. The null (or no-effect)
#value for the mean difference is zero. Therefore,
#based on the 95% confidence interval, we can conclude that there
#is no statistically significant difference in blood pressures
#over time, because the confidence interval for the mean difference
#includes zero.
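
#The same interval can be pulled straight out of the t.test() result;
#subtracting exam6 from exam7 keeps the signs consistent with the
#interpretation above:
t.test(bp$exam7, bp$exam6, paired=TRUE)$conf.int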


###########################Video Lectures###########################
# 1. An instructor for an animal learning course observes
# that some students are very comfortable working with rats
# during their animal training sessions. She suspects that
# these students may have experience with pets that gives them
# an advantage when training their rats. To test her hypothesis,
# she divides her students into two groups:  those that currently
# have a pet at home, and those that don't. Below are the rats'
# performances for each group.  Higher means indicate better
# performance.

#with pets
sample1_mean<-78
sample1_sd<-12.56
n1<-10
pop_mean1<-0

#no pets
sample2_mean<-66
sample2_sd<-12.04
n2<-15
pop_mean2<-0

#1a. What is the alternative hypothesis for this test?
# H0 :μpetOwners ≤ μnoPets
# Ha :μpetOwners > μnoPets

#1b. Solve for each missing component of the t-test equation:
# t = (78 - A) / sqrt( (12.56)^2/B + (C)^2/15 )
sem<-sqrt(((sample1_sd*sample1_sd)/n1) + ((sample2_sd*sample2_sd)/n2))
t_test<-print(((sample1_mean-sample2_mean) - (pop_mean1-pop_mean2))/sem)
round(t_test,3)

a<-66 #A: the no-pets sample mean
b<-10 #B: the pet-owners sample size
c<-12.04 #C: the no-pets standard deviation
  
#1c. What is the t-statistic approximately?
round(t_test,2)
#1d. What is the t-critical value for this test, assuming df = nsmallest - 1 and α=0.05?
degreesFreedom<-print(min(n1,n2)-1)
#browseURL("https://www.stat.wisc.edu/courses/st371-hanlon/hand/table/tTable.pdf")
#     We know that our smallest group has just 10 students, so
#     the degrees of freedom for this test is 9. Nine degrees of
#     freedom at the 0.05 significance level (one-tailed) gives us a
#     critical value of 1.8331.

criticalValue<-abs(qt(.05, df = 9)) #take the absolute value for the positive critical value
t_test>criticalValue #TRUE: reject H0

#1e. What is the appropriate conclusion for this test?

#The rats of students with pets performed significantly better than
#the rats of the other students.

#1f. How would the p-value for this test be reported?

#   A small p-value (typically ≤ 0.05) indicates strong evidence against the null hypothesis, so you reject the null hypothesis. (Your sample provides enough evidence that you can reject null hypothesis for the entire population).
#   A large p-value (> 0.05) indicates weak evidence against the null hypothesis, so you fail to reject the null hypothesis.

pt(-abs(t_test),df=degreesFreedom) #one-tailed p-value for this directional test
#p < 0.05
#Reject H0

#rm(list = ls())  # Clean up



###############Example: Estimating the difference between independent means###############

#point estimate ± margin of error
#(x̄1 - x̄2) ± t_df * SE(x̄1 - x̄2)

  #df for the t-statistic for inference on the difference of 2 means:
  #df<-min(n1-1, n2-1), i.e. degreesFreedom<-print(min(n1,n2)-1)

#Estimate the difference between the average post-meal
#snack consumption between those who eat with and 
#without distractions

# biscuit intake    x̄       sd      n
# solitaire        52.1g   45.1g   22
# no distraction   27.1g   26.4g   22

#solitaire
sample1_mean<-52.1
sample1_sd<-45.1
n1<-22
pop_mean1<-0

#no distraction
sample2_mean<-27.1
sample2_sd<-26.4
n2<-22
pop_mean2<-0

#First, our standard error of the difference between the sample means:
#EQUATION: SE(x̄1 − x̄2) = sqrt(s1^2/n1 + s2^2/n2)
sem<-sqrt(((sample1_sd*sample1_sd)/n1) + ((sample2_sd*sample2_sd)/n2))
sqrt(((45.1*45.1)/22) + ((26.4*26.4)/22)) #with the actual numbers

#Second, we will use that SE in the t-test equation:
t_test<-print(((sample1_mean-sample2_mean) - (pop_mean1-pop_mean2))/sem)
round(t_test,3)

#Third, degrees of freedom
degreesFreedom<-print(min(n1,n2)-1)

#Fourth, the critical value for a 95% confidence interval (two-tailed)
criticalValue<-abs(qt(.025, df = degreesFreedom)) #take the absolute value for a positive critical value

#Fifth, estimate the difference between the averages (the point estimate)
means_difference<-print(sample1_mean - sample2_mean)

#Sixth, calculate bounds of the confidence interval
confidence_interval1<-print(means_difference + criticalValue * sem) #to the right of the mean
confidence_interval2<-print(means_difference - criticalValue * sem) #to the left of the mean

#Seventh, Calculate p-value
2*pt(-abs(t_test),df=degreesFreedom)

#   A small p-value (typically ≤ 0.05) indicates strong evidence against the null hypothesis, so you reject the null hypothesis. (Your sample provides enough evidence that you can reject null hypothesis for the entire population).
#   A large p-value (> 0.05) indicates weak evidence against the null hypothesis, so you fail to reject the null hypothesis.


#Eighth, interpret the data
#Reject H0
#p value is .035 (less than .05)

#rm(list = ls())  # Clean up

#############Sample Problem###############



#standard method
sample1_mean<-72.333 #xbar 1
sample1_sd<-6.3237
n1<-12

pop_mean1<-0

#new method
sample2_mean<-76.400 #xbar 2
sample2_sd<-5.8348
n2<-10
pop_mean2<-0

#First, our standard error of the difference between the sample means:
#EQUATION: SE(x̄1 − x̄2) = sqrt(s1^2/n1 + s2^2/n2)
sem<-sqrt(((sample1_sd*sample1_sd)/n1) + ((sample2_sd*sample2_sd)/n2))
sem
#Second, we will use that SE in the t-test equation:
t_test<-print(((sample1_mean-sample2_mean) - (pop_mean1-pop_mean2))/sem)
round(t_test,3)

#Third, degrees of freedom
degreesFreedom<-print(min(n1,n2)-1)
#Fourth, the critical value for a 95% confidence interval (two-tailed)
criticalValue<-abs(qt(.025, df = degreesFreedom)) #take the absolute value for a positive critical value

#Fifth, estimate the difference between the averages (the point estimate)
means_difference<-print(sample1_mean - sample2_mean)

#Sixth, calculate bounds of the confidence interval
confidence_interval1<-print(means_difference + criticalValue * sem) #to the right of the mean
confidence_interval2<-print(means_difference - criticalValue * sem) #to the left of the mean

#Seventh, Calculate p-value
2*pt(-abs(t_test),df=degreesFreedom)

#   A small p-value (typically ≤ 0.05) indicates strong evidence against the null hypothesis, so you reject the null hypothesis. (Your sample provides enough evidence that you can reject null hypothesis for the entire population).
#   A large p-value (> 0.05) indicates weak evidence against the null hypothesis, so you fail to reject the null hypothesis.


#Eight, interpret data
#fail to reject Reject H0
#p value is 0.1515789 (greater than .05)

#rm(list = ls())  # Clean up

###########Video Lecture: Paired Samples t-test####################

#Power: the probability of detecting a true mean difference of delta=1.5
#(with sd=2) at the .05 significance level
power.t.test(n=20,delta=1.5,sd=2,sig.level=0.05,
  type="one.sample", alternative="two.sided",strict = TRUE)

power.t.test(n=1000, delta=1.5,sd=2,sig.level=0.05,
  type="two.sample", alternative="two.sided",strict = TRUE)
###########R Tutorial Video: Paired t test###########

#import dataset
post<-fread("https://d37djvu3ytnwxt.cloudfront.net/assets/courseware/v1/57279e33ebd396756764815f562fb31d/asset-v1:UTAustinX+UT.7.21x+2T2017+type@asset+block/PostSurvey.csv")
#inspect dataset
glimpse(post)

# We asked them at the beginning of the semester how likely they thought, on a scale of 1 to 10,
# they would be in an exclusive relationship at the end of the year. So 1 means that there's no way they would be
# in an exclusive relationship; 10 means that they're definitely going to be in an exclusive relationship at the end
# of that year.

hist(post$exclusive)
# And if we look at the distribution here, we see something that is not normal in the least. We've got a lot of
# low scores and a lot of high scores and then a few in between. This doesn't look symmetric. Similarly, if we
# look at our post-exclusive variable, we can see sort of the same shape.
hist(post$post_exclusive)
# So even though neither of these distributions is normal,
# the assumption of a paired t-test is that the distribution
# of the differences in measurements is normal.

#so let's create a vector of those differences called diff
diff <- post$exclusive - post$post_exclusive

hist(diff)
# So this vector will represent the change in each student’s response between the first time we ask them and
# the second time. Now, if we look at a histogram of diff, we can see a somewhat symmetric distribution.
#So we are good to go on the normality assumption of our paired
#sample t-test.

t.test(post$exclusive, post$post_exclusive, paired = TRUE)
# So similar to our one sample t-test that we saw in a previous video, we’re going to
# be using the t.test() function. And instead of giving it just
# one vector of numbers, and then a value for mu or a hypothesized
# mean, we’re going to give it our two variables that we want to
# compare, separated by a comma.

# And again, we’re going to get the output of this paired t-test in
# our console window. 
# So it includes the t-statistic, the degrees of freedom,
# and our p value. We see that our p is less than alpha,
# less than 0.05. So we
# would reject the null hypothesis that the mean response to
# the first exclusive variable is equal to the mean of
# the second. And then we also see the mean of the differences
# from our sample. If we were to calculate the
# mean of diff, we would get this exact number from
# our paired t-test output 0.602.
mean(diff)

#So similar to a one sample t-test, if we had a one sided alternative
# hypothesis, we could add the alternative
# option and then give it either the “less” or “greater”
# word in quotes.
t.test(post$exclusive, post$post_exclusive, paired = TRUE,
alternative = "less")

###########R Tutorial Video: Independent t Test###########
#we’re going to compare two independent groups and see if the
#means are the same.

# So for this example, we’re going to look at the gender variable,
# where we have male or female students, and then how long they
# said they slept on Tuesday of that week. And this variable is
# giving us an idea of how much sleep these students are getting
# on a typical weekday. So if we want to compare if the mean number of
# hours of sleep on Tuesday is the same for female students as it is
# for male students, we’re going to use the same t.test() function,
# but we’re going to have to take an extra step first. We’re going
# to need to create two vectors that contain this value of
# “sleep_Tuesday” for both the female students and then the male
# students. So we can use logical indexing to do this. I’m going to
# create a vector called fsleep for female sleep, and I
# want to assign it the value of "sleep_Tues", which was
# our variable. And then I want to index it and only
# select the cases where the gender variable equals "Female".

#females
fsleep <- post$sleep_Tues[post$gender == "Female"]

#males
msleep <- post$sleep_Tues[post$gender == "Male"]
hist(fsleep)
#This is roughly symmetric. There are no outliers. This is approximately a normal
#distribution.

hist(msleep)
#Again, we don’t see any serious skewness or outliers. So we are good to go on the normality assumption for
#our two sample independent t-test


t.test(fsleep, msleep, alternative = "less")
t.test(fsleep, msleep, alternative = "greater")
# similar to the paired example, we’re going to give it our two vectors that we want to compare
# the means. But we no longer have paired data, so we don’t need to give it this paired equals true option.
# The default for t.test is that the paired option is false. And that’s what we have here since we’re running an
# independent t-test. And again, if I wanted that one-sided alternative hypothesis, I could use the alternative
# option.

t.test(fsleep, msleep)
# And we get our output in our console window giving us the t-statistic of negative 0.4156, our degrees of
# freedom, and then also our p value. So because our p value here is greater than 0.05, we would fail to reject
# the null hypothesis that the means of these two groups are equal. Similar to the other t-test, we get a 95%
# confidence interval for the difference in the means, and then also our sample estimate– which is the mean
# hours of sleep for females, 6.809, and also for males, 6.938.
#rm(list = ls())  # Clean up
#################Directional Hypothesis Tests####################
# NOTE:  If you are running directional hypothesis tests, remember that
#you must modify the code to reflect this direction.
# A one-sided test looks like this:   
# 
# t.test(Variable1, Variable2, alternative = 'less'),
#when you expect Mean1 < Mean2

# t.test(Variable1, Variable2, alternative = 'greater'),
#when you expect Mean1 > Mean2
# 

################Pre-Lab 3: Post Student-Survey Data###################

# Primary Research Questions
# 
# 1.  Who is happier at the beginning of the semester:
#under-classmen or upper-classmen?
# 2.  Does student happiness change from the beginning of
#the semester to the end?


#import dataset
post<-fread("https://d37djvu3ytnwxt.cloudfront.net/assets/courseware/v1/57279e33ebd396756764815f562fb31d/asset-v1:UTAustinX+UT.7.21x+2T2017+type@asset+block/PostSurvey.csv")

#1a. How many students are in the dataset?
glimpse(post)

#1b. What is the classification of the first male student? (Make sure your spelling matches the variable outcome as spelled in the dataframe.)
males<-subset(post, subset = gender == "Male")
head(males$classification)

#1c. Of the first 10 students in the dataset, what percentage live on campus? (Report without the "%" sign.)
first10<-post[1:10,]
live<-table(first10$live_campus)
prop.table(live) * 100

#Check the Variables of Interest

#2a. Which variable tells us whether a student is a lower-classman (freshman or sophomore)?

# The variable name in the dataset is classification,
# which is a categorical variable.
# 
# 2b. Which variable tells us how happy students were at the
# beginning of the semester?
# 
# The variable name in the dataset is happy,
# which is a quantitative variable.
# 
# 2c. Which variable tells us how happy students were at the
# end of the semester?
# 
# The variable name in the dataset is post_happy,
# which is a quantitative variable.


#3a. We will use an independent t-test to help us compare the 
#happiness of the under and upper-classmen. Why?
#We want to compare the happiness of two different populations of students.

#3b. We will use a dependent t-test to help us determine whether
#happiness levels changed over the semester. Why?
#We are looking for a change over time for the same group of students.

#Prepare for the Analysis

# Let's break this analysis into its required steps:
# 
# Question 1:  Independent t-test
# 1. Make a vector of happiness scores for each sample (under- and upper-classmen).
# 2. Generate histograms to check the Normality assumption. 
# 3. Run an independent t-test.
# 4. Interpret the results.
# 
# Question 2:  Dependent t-test
# 1. Make a vector of difference scores for student happiness from the beginning to the end of semester.
# 2. Generate a histogram of the difference scores to check the Normality assumption.
# 3. Run a dependent t-test.
# 4. Interpret the results.

#Here is the code you will use:

#Lab Question 1

# Make a vector of happiness scores for each sample
underclass_happy <- post$happy[post$classification=='Freshman'|post$classification=='Sophomore']
upperclass_happy <- post$happy[post$classification=='Junior'|post$classification=='Senior']

# Check the normality assumption
hist(underclass_happy, xlab='Underclassman Happiness', main='Percent of Time Happy')
hist(upperclass_happy, xlab='Upperclassman Happiness', main='Percent of Time Happy')

# Run independent t-test
t.test(underclass_happy, upperclass_happy)

#Lab Question 2

# Make a vector of difference scores
post$diff_happy <- post$happy - post$post_happy

# Check the normality assumption
hist(post$diff_happy, xlab= 'Difference in Happiness over the Semester', main = 'Happy-Post Happy')

# Run dependent t-test
t.test(post$happy, post$post_happy, paired=T)
#1. Which classifications of students are considered upperclassmen,
#according to the code above?
#juniors and seniors 

#2. How many sample means are being compared in the t-test
#for Lab Question 1?
#two

#3. What does this line of code do?
post$diff_happy <- post$happy - post$post_happy
#Creates a new variable for each student in the dataset 

#4. A student was happy 75% of the time at the beginning
#of the semester and 90% at the end of the semester.
#What will be the value of post$diff_happy for this student?
75-90

#Suppose we wanted to test the happiness scores of those who live
#on campus against those who live off campus. What has caused
#the error below?


on_campus <- post[post$live_campus == 'yes',]
off_campus <- post[post$live_campus == 'no',]
on_campus_happy <- on_campus$happy
off_campus_happy <- off_campus$happy
#t.test(on_campus_happy, off_campus_happy, paired = T)
#Error in complete.cases(x, y) : not all arguments have the same length

#ANSWER: We ran the wrong type of test.

#Conduct the Analysis
#Pre-Lab Question 1

#At the beginning of the semester:

#1a. What percent of the time, on average, were underclassmen
#happy? (round to one decimal place)
hist(underclass_happy)
round(mean(underclass_happy),1)
#1b. What percent of the time, on average,
#were upperclassmen happy? (round to one decimal place)
hist(upperclass_happy)
round(mean(upperclass_happy),1)

#Report test results, rounded to 3 decimal places:
#1c. t-statistic=
# Run independent t-test
t.test(underclass_happy, upperclass_happy)
round(0.42302,3)

#1d. p-value=
p_value<-round(0.6748,3)

#1e. We should fail to reject the null hypothesis that there is no
#difference in happiness between under- and upper-classmen at the beginning
#of the semester.

p_value >.05 #TRUE: fail to reject
#   A small p-value (typically ≤ 0.05) indicates strong
#evidence against the null hypothesis, so you reject the
#null hypothesis. (Your sample provides enough evidence that
#you can reject null hypothesis for the entire population).
#   A large p-value (> 0.05) indicates weak evidence against
#the null hypothesis, so you fail to reject the null hypothesis.

# Pre-Lab Question 2
# 
# 2a. How many students were in this analysis?
#214

#2b. The average change in happiness was 1.27%. Was this an
#increase or decrease over the semester?
#decrease

#Report test results, rounded to 3 decimal places:
#2c. t-statistic=
t.test(post$happy, post$post_happy, paired=T)
round(1.6838, 3)

#2d. p-value=
p_value<-round(0.09368,3)

#2e. We should fail to reject the null hypothesis that there is no
#change in happiness over the semester.

p_value >.05 #TRUE: fail to reject
#   A small p-value (typically ≤ 0.05) indicates strong
#evidence against the null hypothesis, so you reject the
#null hypothesis. (Your sample provides enough evidence that
#you can reject null hypothesis for the entire population).
#   A large p-value (> 0.05) indicates weak evidence against
#the null hypothesis, so you fail to reject the null hypothesis.
#What were the shapes of these distributions?
#Pre-Lab Question 1
#3a. The distribution of underclassmen happiness was ________.
hist(underclass_happy)
#left-skewed

#3b. The distribution of upperclassmen happiness was ________.
hist(upperclass_happy)
#left-skewed

#Pre-Lab Question 2

#3c. The distribution of happiness difference scores was ________.
hist(post$diff_happy)
#nearly normal but with a slight positive skew
#Write Your Conclusion
#Answer the question and support your answer with statistics:

# The average happiness scores for under-classmen (79.7%)
# and upper-classmen (78.3%) were not found to be significantly
# different (t = .423, p = .675). The distributions of scores
# for each group were negatively skewed. However, the sample
# sizes were both sufficiently large to meet the assumption
# of Normality. Over the semester, student happiness
# decreased by an average of 1.27%. 
# This was not a statistically significant change
# (t = 1.684, p = .094). Overall, there does not appear to be
# any significant difference in levels of student happiness
# based on their year in college, or the time of the semester.

#rm(list = ls())  # Clean up

#####################Week 3: Lab#################

#Review of Two-Sample t-Tests

#In this lab, you will use two-sample t-tests to answer
#a question of interest. Let's start by remembering why
#we use these hypothesis tests.

#INDEPENDENT SAMPLES: When we work with two independent samples,
#our null hypothesis assumes that if the samples are selected
#at random, the samples will vary only by chance and the
#difference will not be statistically significant. In short,
#when we have independent samples we assume that the scores
#of one sample do not affect the other.

#DEPENDENT SAMPLES: Two samples of data are dependent
#when each score in one sample is paired with a specific
#score in the other sample.

#Two samples are considered dependent when:
#each score in one sample is paired with a specific score in the other sample.


#Two samples are considered independent when:
#the scores of one sample do not affect the scores of the other sample

#Lab Preparation

post<-fread("https://d37djvu3ytnwxt.cloudfront.net/assets/courseware/v1/57279e33ebd396756764815f562fb31d/asset-v1:UTAustinX+UT.7.21x+2T2017+type@asset+block/PostSurvey.csv")

#We will be answering each of the following questions in lab. Match each question to the type of t-test needed to run the analysis.

#Question 1: Do students at UT spend more time on
#homework per week in college than they did in
#high school?
#dependent t-test

#Question 2: Do students in fraternities and
#sororities get less sleep on the weekends than
#other college students?
#independent t-test

# NOTE:  If you are running directional hypothesis tests, remember that you must modify the code to reflect this direction.
# A one-sided test looks like this:   
# 
# t.test(Variable1, Variable2, alternative = 'less'), when you expect Mean1 < Mean2
# t.test(Variable1, Variable2, alternative = 'greater'), when you expect Mean1 > Mean2
# 

#Lab Question 1: Do students at UT spend more time on
#homework per week in college than they did in high school? (Dependent t test)

# Make a vector of difference scores
post$diff_hw_hours <- post$hw_hours_HS - post$hw_hours_college

# Check the normality assumption
hist(post$diff_hw_hours, xlab= 'Difference in Time Spent on Homework in HS & College ', main = 'difference in hours')

# Run dependent t-test
t.test(post$hw_hours_HS, post$hw_hours_college, paired=T)

#1a. On average, students spent how many hours more on homework each week in college than they did in high school? (round to 2 decimal)
round(10.94626,2)

#1b. What was the t-statistic for this test?
#(Round to 2 decimal places. Depending on how you solved this problem,
#your answer will be either negative or positive. Please report as a positive
#or absolute value.)
round(abs(-16.812),2)

#1c. How many degrees of freedom? (no decimal places)
213

#1d. What was the p-value?
p_value <- 2.2e-16
p_value <= .05
#strong evidence against the null hypothesis=reject the null
#Conclusion: Students at UT do spend more time on
#homework per week in college than they did in high school

#   A small p-value (typically ≤ 0.05) indicates strong
#evidence against the null hypothesis, so you reject the
#null hypothesis. (Your sample provides enough evidence that
#you can reject null hypothesis for the entire population).
#   A large p-value (> 0.05) indicates weak evidence against
#the null hypothesis, so you fail to reject the null hypothesis.

#Lab Question 2: Independent t test: Do students in fraternities and sororities
#get less sleep on the weekends than other college students?


# Make vectors of non-Greek and Greek weekend sleep after splitting the data
nongreek<-subset(post, subset = greek == "no")
greek<-subset(post, subset = greek == "yes")

nongreek_sleep <- nongreek$sleep_Sat
greek_sleep<- greek$sleep_Sat

# Check the normality assumption
hist(nongreek_sleep, xlab='Non-greek', main='Non-greek Weekend Sleep Hours')
hist(greek_sleep, xlab='Greek', main='Greek Weekend Sleep Hours')

# Run independent t-test// when you expect Mean1 > Mean2
t.test(nongreek_sleep, greek_sleep, alternative = "greater")
#2a. On average, students who are Greek sleep how many hours less
#than Non-Greek students on Saturday nights? (report to 1 decimal place)
round(8.04264 - 7.727273,1)

#2b. What is the t-statistic for this test?
#(Report to 3 decimal places. Depending on how you
#solved this problem, your answer will be either
#negative or positive. Please report as a positive or absolute value.
round(0.98077,3)

#2c. How many degrees of freedom? (round to no decimal places)
round(62.948)

#2d. What was the p-value? (report to 3 decimal places)
p_value<-0.1652
round(0.1652,3)

#2e. Based on these results, we would conclude that people who
#are in fraternities or sororities ________ get less sleep on the
#weekends than other college students.

p_value <= .05
p_value>.05
#fail to reject H0

#   A small p-value (typically ≤ 0.05) indicates strong
#evidence against the null hypothesis, so you reject the
#null hypothesis. (Your sample provides enough evidence that
#you can reject null hypothesis for the entire population).
#   A large p-value (> 0.05) indicates weak evidence against
#the null hypothesis, so you fail to reject the null hypothesis.

#3. The Normality assumption was met in each hypothesis test.

#Write Your Conclusion
#Answer the question and support your answer with statistics:

# The distribution of differences in the time that UT students spent on
# homework in high school versus college was found to be
# approximately normal. The difference showed that, in college,
# students spend 10.95 hours more on homework per week than in high
# school. This difference is significant (t(213) = 16.81, p < .05).
# 
# The distributions for the amount of sleep for students in
# fraternities and sororities and for other students are both
# approximately normal. There was not a significant difference between
# the amount of Saturday night sleep for the students who are in
# the Greek system (mean = 7.73) and those who are not (mean = 8.04),
# with a t-statistic of 0.981 (degrees of freedom = 62.95) and a p-value
# of 0.165.

###########Week 3: Problem Set############################

# Question 1
# 
# Is the increase in time spent studying from high school to
#college the same for nursing majors and biology majors?  
# 
# 1. Create a new variable that equals the difference in hours
#spent studying per week in college versus high school for each student. 
# 
# 2. Create two vectors of those differences, one for nursing
#majors and one for biology majors.
# 
# 3. Use this data to answer the following questions.  

# difference in hours spent studying per week in college
# versus high school for each student
post$diff_hw_hours <- post$hw_hours_HS - post$hw_hours_college
#two vectors of those differences, one for nursing
#majors and one for biology majors.

biology<-post[ which(post$major=="Biology"),]
nursing<-post[ which(post$major=="Nursing"),]

biology_majors<-biology$diff_hw_hours
nursing_majors<-nursing$diff_hw_hours

# Check the normality assumption
hist(biology_majors, xlab='biology', main='Difference in hours Biology Majors')
hist(nursing_majors, xlab='nursing', main='Difference in Hours Nursing Majors')

# Run independent t-test
t.test(biology_majors, nursing_majors)
p_value<-0.5379

p_value <= .05 #reject null hypothesis if true
p_value>.05 #fail to reject null hypothesis if true
#fail to reject H0

#   A small p-value (typically ≤ 0.05) indicates strong
#evidence against the null hypothesis, so you reject the
#null hypothesis. (Your sample provides enough evidence that
#you can reject null hypothesis for the entire population).
#   A large p-value (> 0.05) indicates weak evidence against
#the null hypothesis, so you fail to reject the null hypothesis.



#1a. Which of the following methods should be used to answer the question
#above?
#Two-sample independent t-test correct

#1b. Create a histogram to confirm the normality assumption for
#each sample. Has the normality assumption been met?
hist(biology_majors, xlab='biology', main='Difference in hours Biology Majors')
hist(nursing_majors, xlab='nursing', main='Difference in Hours Nursing Majors')
#yes
#1c. Run the appropriate t-test for this analysis.
#What is the t-statistic? (Report as a positive number
#rounded 2 decimal places.)
t.test(biology_majors, nursing_majors)
round(0.62301,2)

#1d. How many degrees of freedom are there for this test? (Round to 2 decimal places.)
round(30.886,2)
#1e. What is the p-value for this test? (Round to 2 decimal places.)
round(0.5379,2)
#1f. Which of the following is an appropriate conclusion for
#this analysis (assuming α = .05)?
#We fail to reject the null hypothesis; the increase in study time
#is the same for biology and nursing majors
#Question 2
#A study was conducted to compare the resting pulse rates of
#college smokers and non-smokers.  The data for a randomly selected
#group is summarized in the table below. Pulse rates were normally
#distributed within each group.
#right-tailed (upper-tail) test
# H0: μsmokers ≤ μnonsmokers
# Ha: μsmokers > μnonsmokers

#smokers
sample1_mean<-80
sample1_sd<-5
n1<-26
pop_mean1<-0

#non Smokers
sample2_mean<-74
sample2_sd<-6
n2<-32
pop_mean2<-0

sem<-sqrt(((sample1_sd*sample1_sd)/n1) + ((sample2_sd*sample2_sd)/n2))
t_test<-print(((sample1_mean-sample2_mean) - (pop_mean1-pop_mean2))/sem)
round(t_test,3)

#What is the t-critical value for this test, assuming df = nsmallest - 1 and α=0.05?
degreesFreedom<-print(min(n1,n2)-1)
#browseURL("https://www.stat.wisc.edu/courses/st371-hanlon/hand/table/tTable.pdf")
#     We know that our smallest group has just 26, so
#     the degrees of freedom for this test is 25. 25 degrees of
#     freedom at the 0.05 significance level gives us a critical
#     value of 1.7081 (one-tailed).

criticalValue<- print(1.7081)
abs(qt(.05, df = degreesFreedom)) #take the absolute value for a positive critical value
t_test

#2a. What is the appropriate method for analyzing this data?
#independent t-test

#2b. What is the alternative hypothesis for this test
#if the researchers expect smoking to raise pulse rates?
# H0: μsmokers ≤ μnonsmokers
# Ha: μsmokers > μnonsmokers

#2c. How many degrees of freedom should we use for this test if
#we are to estimate rather than use a calculator?
25

#2d. What is t-critical, assuming α=0.05? (Round to 3 decimal places.
#Use your answer to Question 2c to help find the answer.)
abs(round(qt(.05, df = degreesFreedom),3)) #take the absolute value for a positive critical value

#2e. Calculate the standard error. (Round to 2 decimal places.)
sem<-sqrt(((sample1_sd*sample1_sd)/n1) + ((sample2_sd*sample2_sd)/n2))
round(sem,2) #1.44

#2f. Calculate the test statistic.
#(Round to 2 decimal places, and use rounded values
#from previous answers.)
t_test<-print(((sample1_mean-sample2_mean) - (pop_mean1-pop_mean2))/1.44)
round(t_test,2)

#2g. Is there evidence to suggest that the pulse rate of smokers
#is higher on average than the pulse rate of non-smokers?
p_value<-pt(-abs(t_test),df=degreesFreedom) #one-tailed p-value for this directional test
p_value <= .05 #reject null hypothesis if true
p_value>.05 #fail to reject null hypothesis if true
#Reject H0
#Answer: YES

#2g. How would the p-value be reported in your conclusion?
#p < 0.05
#rm(list = ls())  # Clean up
#Question 3
# Some nerve cells have the ability to regenerate. Researchers think
# that these cells may generate creatine phosphate (CP) to stimulate
# new cell growth.
# 
# To test this hypothesis, researchers cut the nerves emanating
# from the left side of the spinal cord in a sample of rhesus monkeys,
# while the nerves on the right side were kept intact.  They then
# compared the CP levels (mg/100g) in nerve cells on both sides. 

#Dependent t-test
left_side<-c(16.3, 4.8, 10.7, 14.0, 15.7, 9.9, 29.3, 20.4, 
  15.7, 7.6, 16.2, 14.7, 15.0, 8.4, 23.3, 17.7) #regenerating
right_side<-c(11.5, 3.5, 12.8, 7.9, 15.2, 9.8, 24.0, 14.9,
  12.6, 8.2, 8.4, 11.0, 12.5, 9.2, 17.5, 11.1) #the control
cp_levels<-data.frame(cbind(left_side, right_side))
cp_levels$diff_cp<-cp_levels$left_side - cp_levels$right_side
n<-print(length(cp_levels$diff_cp))

sd_difference<-sd(cp_levels$diff_cp)
mean_difference<-abs(mean(cp_levels$diff_cp))

# NOTE:  If you are running directional hypothesis tests,
#remember that you must modify the code to reflect this direction.
# A one-sided test looks like this:   
# 
# t.test(Variable1, Variable2, alternative = 'less'), when you expect Mean1 < Mean2
# t.test(Variable1, Variable2, alternative = 'greater'), when you expect Mean1 > Mean2
# 

#Assume the researchers calculated the difference scores as d = CPleft - CPright.
#They set α = 0.05. 

#Hypothesis Step 1: Clearly state the Null and Alternative Hypothesis.
# H0: μleft ≤ μright
# Ha: μleft > μright


#Hypothesis Step 2: Identify the appropriate significance level
#and confirm the test assumptions.
# assume the standard significance level of 0.05
# observations are random and independent
# distribution of all possible difference scores in the population is nearly normally distributed.
hist(cp_levels$diff_cp)
#Hypothesis Step 3: Analyze the data.

#3A: Calculate the Standard Error
standard_error<-print(sd_difference/sqrt(n))

#3B: Calculate degrees of freedom
degreesFreedom<-print(n-1)
#3C: Calculate the t-statistic
t_test<-print((mean_difference-0)/standard_error)

#3D: Calculate Critical Value
critical_value<-qt(.05, df = degreesFreedom) #qt(.05, df) returns the negative lower-tail value
abs(critical_value) #take the absolute value for the positive critical value

#Hypothesis Step 4: Interpret your results.
t_test>abs(critical_value) #if true, reject null hypothesis
#As our calculated t-statistic is greater than our
#t-critical value (our value lies in the critical region),
#we reject our Null hypothesis and conclude that there was in fact
#a difference between the left and right sides.

# Run dependent t-test using base R
t.test(left_side, right_side, paired = TRUE)

p_value<-pt(-abs(t_test),df=degreesFreedom) #one-tailed p-value for this directional test
p_value
p_value <= .05 #reject null hypothesis if true
p_value>.05 #fail to reject null hypothesis if true
#Reject the null hypothesis

#3a. Which is the appropriate method for testing the researchers'
#hypothesis?
#Dependent t-test

#3b. How many degrees of freedom are there?
degreesFreedom

#3c. What is the t-critical value? (Round to 3 decimal places.)
abs(round(critical_value,3)) #report the positive critical value

#3d. How much of a difference in creatine phosphate
#was observed, on average, between the left and
#right nerve cells? (Report as a positive value
#rounded 1 decimal place).
round(mean_difference,1)

#3e. What is the Standard Deviation of the difference scores? (Round to 2 decimal places.)
round(sd_difference,2)
#3f. What is the Standard Error for your t-test? (Round to 2 decimal places.)
round(standard_error,2)

#3g. What is your test statistic? (Round to 2 decimal places.)
round(t_test,2)

#3h. Is there sufficient evidence to suggest that the levels of
#creatine phosphate are higher in regenerating cells than they
#are in the normal (control) cells?
p_value <= .05 #reject null hypothesis
#Yes
#3i. The researchers finish their analysis by calculating a
#95% confidence interval for the true increase in CP levels
#in rejuvenating nerve cells. What are the lower and upper
#bounds? (Round each to 1 decimal place.)
t.test(left_side, right_side, paired = TRUE)

lowerBound<-print(round(1.473358,1))
upperBound<-print(round(4.726642,1))
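#By hand, the same interval is mean_difference ± t* × standard_error,
#with t* = qt(.975, df). A quick sketch reusing the variables computed
#earlier in this section (not part of the original course code):
c(mean_difference - qt(.975, degreesFreedom) * standard_error,
  mean_difference + qt(.975, degreesFreedom) * standard_error)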

#rm(list = ls())  # Clean up

#########Additional Paired t test examples##################
library(MASS)
head(immer)

#Problem
#Assuming that the data in immer follows the normal distribution,
#find the 95% confidence interval estimate of the difference between
#the mean barley yields between years 1931 and 1932.

t.test(immer$Y1, immer$Y2, paired=TRUE) 

#Between years 1931 and 1932 in the data set immer, the 95%
#confidence interval of the difference in means of the barley 
#yields is the interval between 6.122 and 25.705.
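#Note: a paired t-test is equivalent to a one-sample t-test on the
#difference scores, so this one-liner (a sketch) reproduces the same interval:
t.test(immer$Y1 - immer$Y2)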
##############Week 4: Chi-Square##########################

# The primary difference between a chi-square test and the tests
# we have worked with before is that chi-square tests are used
# for categorical data. The chi-square test can be used to
# estimate how closely the distribution of a categorical variable
# matches an expected distribution (the goodness-of-fit test),
# or to estimate whether two categorical variables are independent
# of one another (the test of independence). 
# 
# The null and alternative hypotheses reflect this focus:
#   
# H0 : The population distribution of the variable is the
# same as the proposed distribution
# HA : The distributions are different
# 
# Observed = actual count values in each category
# Expected = the predicted (expected) counts in each category
# if the null hypothesis were true
# 
# Assumptions of the Chi-Square test
# The assumptions of the chi-square test are the same whether
# we are using the goodness-of-fit or the test-of-independence.
# The standard assumptions are:
# • Random sample/assignment
# • Independent observations for the sample (one observation per subject)
#    each case only contributes to one cell in the table. n < 10% of population.
# • No expected counts less than five.
# --each cell must have at least 5 expected cases
# 
# 
# Notice that the last two assumptions are concerned with the
# expected counts, not the raw observed counts.
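#A quick sketch of checking the expected-counts assumption in R: ask
#chisq.test() for its expected counts before trusting the test.
#(Toy counts and proportions, for illustration only.)
toy_counts <- c(40, 20)
chisq.test(toy_counts, p = c(2/3, 1/3))$expected #every cell should be >= 5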

# Goodness of fit is determining how well the expected numbers
# match the observed numbers

# Example A
# The American Pet Products Association conducted a survey in
# 2011 and determined that 60% of dog owners have only one dog,
# 28% have two dogs, and 12% have three or more. Supposing that
# you have decided to conduct your own survey and have collected 
# the data below, determine whether your data supports the results
# of the APPA study. Use a significance level of 0.05.
# 
# Data: Out of 129 dog owners, 73 had one dog and 38 had two dogs.
# 
# Solution
# • Step 1: Clearly state the null and alternative hypotheses.
# H0: The proportion of dog owners with one, two or three dogs is
# 0.60, 0.28 and 0.12 respectively.
# 
# H1: The proportion of dog owners with one, two or three dogs
# does not match the proposed model.
# 
# • Step 2: Identify an appropriate test and significance level.
# A Chi-Square goodness of fit test is appropriate because we are
# examining the distribution of a single categorical variable.
# In the absence of a stated significance level in the problem,
# we assume the default 0.05.
# 
# • Step 3: Analyze sample data.
#Create a table to organize data and compare the observed data to the expected data:
#browseURL("http://www.cyclismo.org/tutorial/R/tables.html") #two-way tables
ob1<-73
ob2<-38
ob3<-18
obtotal<- ob1 + ob2 + ob3 #make sure counts add up to sample size

#To identify the expected values, multiply the
#expected % by the total number observed:
#0.60, 0.28 and 0.12 
exp1<-.60 * obtotal
exp2<-.28 * obtotal
exp3<-.12 * obtotal

#To calculate our chi-square statistic,
#we need to sum the squared difference between
#each observed and expected value divided by the
#expected value:
chiSquare<-((ob1-exp1)*(ob1-exp1)/exp1) +
  ((ob2-exp2)*(ob2-exp2)/exp2) +
  ((ob3-exp3)*(ob3-exp3)/exp3)
chiSquare

degreesFreedom<-print(3-1) #how many groups/levels - 1

#P-VALUE (JUST LIKE IN AN F-TEST)

#The p-value for a chi-square test is defined as the tail area above
#the calculated test statistic, because the test statistic is always
#positive and a higher statistic means a greater deviation from the
#null hypothesis.

# Now that we have our chi-square statistic, we need
# to compare it to the chi-square value for the significance
# level 0.05. We can use a reference table. Just as
# with the t-tests, we will need to know the degrees
# of freedom, which equal the number of observed category
# values minus one. In this case, there are three category
# values: one dog, two dogs, and three or more dogs.
# The degrees of freedom, therefore, are 3 − 1 = 2.

#browseURL("https://people.richland.edu/james/lecture/m170/tbl-chi.html")
#browseURL("http://bitly.com/di") # or use applet

criticalValue<-print(qchisq(.05, df = degreesFreedom,
  lower.tail = FALSE)) #R method: we always have a positive critical value

p_value<-pchisq(chiSquare, df = degreesFreedom, lower.tail = FALSE) #upper tail: the area above the chi-square statistic
p_value
p_value <= .05 #reject null hypothesis if true
p_value > .05 #fail to reject null hypothesis if true

#Using the table, we find that the critical value for a
# 0.05 significance level with df = 2 is 5.99.
# That means that 95 times out of 100, a sample drawn from a
# population that matches the model will have a χ2 value of 5.99
# or less. If our chi-square value is greater than 5.99,
# then measurements like ours would occur 5 or fewer
# times out of 100 by chance, and we would reject the null hypothesis.
# Our chi-square statistic is only about 0.76, so we will
# not reject the null hypothesis.

#ANSWER: we can say that our survey data does support the data from the APPA.
#rm(list = ls())  # Clean up
###########Lecture Videos: Chi-Square Goodness-of-fit, Part One#######################
# Choose the statement that is NOT true about the Chi-square test.
# ANSWER: Values for chi square can be negative, positive or zero
#2. A major snack food company claims that its chips are "America's
#favorite." A statistics class tests this claim by asking a sample of 90
#random students on campus to select their favorite chip from the company's
#brand (Brand A) and two other brands (Brand B and Brand C). Below are
#the results of how many students selected each brand in their taste
#test.
brandA <- 38
brandB <- 28
brandC <- 24
obtotal <- brandA + brandB + brandC
event<-data.frame(cbind(brandA, brandB, brandC))
chisq.test(event)

exp1<-.333 * obtotal
exp2<-.333 * obtotal
exp3<-.333 * obtotal
degreesFreedom<-print(3-1) #how many groups/levels - 1
chiSquare<-((brandA-exp1)*(brandA-exp1)/exp1) +
  ((brandB-exp2)*(brandB-exp2)/exp2) +
  ((brandC-exp3)*(brandC-exp3)/exp3)
chiSquare

# H0 : The population distribution of the variable is the
# same as the proposed distribution--there is nothing going on
# HA : The distributions are different

##2a. What is the distribution model for the null hypothesis?
#33.3% for all three categories.

#2b. Input or identify the following values necessary for this chi-square test. Assume a significance level of 0.05.
#Expected value of Brand A:(Report as a whole number.)
exp1<-print(.333 * obtotal)

#Expected value of Brand B:(Report as a whole number.)
exp2<-print(.333 * obtotal)

#Expected value of Brand C:(Report as a whole number.)
exp3<-print(.333 * obtotal)

#Degrees of freedom:(Report as a whole number.)
degreesFreedom<-print(3-1) #how many groups/levels - 1

#Chi-square statistic:(Rounded to 2 decimal places.)
chisq.test(event)
round(3.4667,2)

#Chi-square critical value:(Rounded to 2 decimal places.)
#browseURL("https://people.richland.edu/james/lecture/m170/tbl-chi.html")
round(5.991,2)

#2c. What conclusion should be drawn about the popularity of Brand A?

p_value<-pchisq(chiSquare, 2, lower.tail = FALSE) #upper tail: the area above the chi-square statistic
p_value

p_value <= .05 #reject null hypothesis if true
p_value > .05 #fail to reject null hypothesis if true
#Fail to reject null hypothesis since p-value is .1767
#ANSWER: There is no evidence to suggest that Americans prefer Brand A to the other brands tested.

#rm(list = ls())  # Clean up

###########Lecture Videos: Chi-Square Goodness-of-fit, Part Two#######################
#1. Jurors are selected from the list of registered voters,
#so the ages for jurors should have the same distribution as
#the ages of voters. A law professor obtains voter registration
#records and finds that 20% of registered voters are 18-29, 45%
#are 30-49, and 35% are age 50 or older. The professor then
#monitors jury composition over a month-long period and finds
#the following distribution of jurors:  

ob1<-12
ob2<-36
ob3<-32
obtotal<- ob1 + ob2 + ob3 #make sure counts add up to sample size

#To identify the expected values, multiply the
#expected % by the total number observed:
#20% of registered voters are 18-29, 45%
#are 30-49, and 35% are age 50 or older.
exp1<-.20 * obtotal
exp2<-.45 * obtotal
exp3<-.35 * obtotal
expected<-data.frame(cbind(exp1, exp2, exp3))
expected

#To calculate our chi-square statistic,
#we need to sum the squared difference between
#each observed and expected value divided by the
#expected value:
chiSquare<-((ob1-exp1)*(ob1-exp1)/exp1) +
  ((ob2-exp2)*(ob2-exp2)/exp2) +
  ((ob3-exp3)*(ob3-exp3)/exp3)
chiSquare
degreesFreedom<-print(3-1) #how many groups/levels - 1
event<-data.frame(cbind(ob1, ob2, ob3))
#chisq.test(event)

p_value<-pchisq(chiSquare, 2, lower.tail = FALSE) #upper tail: the area above the chi-square statistic
p_value
p_value <= .05 #reject null hypothesis if true
p_value > .05 #fail to reject null hypothesis if true
#Fail to reject null hypothesis since p-value is above .05 (p = .455794)
#CONCLUSION: There is no evidence to suggest that the ages of jurors
#are distributed differently than the ages of voters.


#1a. We want to match the distribution of a categorical variable to a hypothesized distribution model. Which Chi Square test should we run?
#Goodness of fit

#1b. The null hypothesis is H0: The ages of jurors are 
#distributed the same way as the ages of voters. What
#is the alternative hypothesis?
# H0 : The population distribution of the variable is the
# same as the proposed distribution--
#20% of registered voters are 18-29, 45% are 30-49, and
#35% are age 50 or older (i.e. there is nothing going on)
# HA : The distributions are different

#ANSWER: The age of jurors are not distributed the
#same way as the ages of voters. 
#1c. One law student divides the total number of
#jurors by the number of categories and comes up with
#the following expected values for each category:
#18-29: 26.67
#30-49: 29.67
#50+: 29.67
#What did he do wrong?
#He assumed the jurors should be evenly distributed across the categories, but this is not the hypothesized model.
#1d. Find the expected values for each category, assuming the age distributions
#of jurors and voters are the same.
#18-29 years old:(Report as a whole number.)
round(exp1)
#30-49 years old:(Report as a whole number.)
round(exp2)
#50 years or older:(Report as a whole number.)
round(exp3)
#1e. Find the chi-square statistic using the following formula:
chiSquare<-((ob1-exp1)*(ob1-exp1)/exp1) +
  ((ob2-exp2)*(ob2-exp2)/exp2) +
  ((ob3-exp3)*(ob3-exp3)/exp3)
round(chiSquare,3)

#1f. What is the degrees of freedom for this hypothesis test?
degreesFreedom<-print(3-1) #how many groups/levels - 1

#1g. What is the critical Chi-square value for this hypothesis test? (Rounded to 2 decimal places)
#browseURL("https://people.richland.edu/james/lecture/m170/tbl-chi.html")
round(5.991,2)

#1h. Based on your above answers, you should __________ the null hypothesis.
p_value<-pchisq(chiSquare, 2, lower.tail = FALSE) #upper tail: the area above the chi-square statistic
p_value
p_value > .05 #fail to reject null hypothesis if true
#Answer: Fail to reject

#1i. What is the appropriate interpretation of this hypothesis test?
#We have no evidence to suggest that the age distributions of jurors are any different than those of registered voters.

#rm(list = ls())  # Clean up

###########Lecture Videos: Chi-Square Test-of-Independence#######################
#1. Here is the data from a research study on the relationship between sex and fear of heights.

heights<-matrix(c(68,109,94,89), ncol = 2, byrow = TRUE)
colnames(heights) <- c("men", "women")
rownames(heights) <- c("fear", "no fear")
heights<-as.table(heights)
heights
addmargins(heights)
prop.table(heights) #entire proportions
addmargins(prop.table(heights))
prop.table(heights, margin = 1) #fear vs no fear prop
round(prop.table(heights, margin = 2)*100,2) # gender prop

chiSquare <- 6.09 #taken from the chisq.test output below
p_value <- 0.01356

# Run test of independence
chisq.test(heights, correct=FALSE)

#If you want to do a chi-squared test to determine if the proportions are
#different, there is an easy way to do this: if we want to test at the 95%
#confidence level, we need only look at a summary of the table, which
#includes a chi-squared test of independence:
summary(heights)

# H0 : The population distribution of the variable is the
# same as the proposed distribution--
# (i.e. there is nothing going on)
# HA : The distributions are different

#1a. What are the expected values for each of the following groups?
chisq.test(heights, correct=FALSE)$expected
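#By hand, each expected count is (row total * column total) / grand total.
#For example, men who expressed a fear of heights:
(177 * 162) / 360 #79.65, matching the $expected output above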
#Men who expressed a fear of heights:(Round to 2 decimal places.)
79.65
#Women who expressed a fear of heights:(Round to 2 decimal places.)
97.35

#Men who did not express a fear of heights:(Round to 2 decimal places.)
82.35

#Women who did not express a fear of heights:(Round to 2 decimal places.
100.65

#1b. Which of the following best summarizes how the observed values compare to the expected values?
heights
chisq.test(heights, correct=FALSE)$expected
#ANSWER: Fewer men expressed a fear of heights than expected,
#and more women expressed a fear of heights than expected.

#1c. The researchers found a significant difference between men
#and women. It appears that sex is related to a fear of heights.
#We should see that difference reflected in the conditional probabilities.
#Fill in those values below.

#Proportion of the entire sample that has a fear of heights:
#(Round to 3 decimal places.)
round(177/360,3)
addmargins(prop.table(heights))

#Proportion of the men that have a fear of heights:(Round to 3 decimal places.)
round(68/162,3)
round(prop.table(heights, margin = 2)*100,2) # gender prop
#Proportion of the women that have a fear of heights:(Round to 3 decimal places.)
round(109/198,3)
round(prop.table(heights, margin = 2)*100,2) # gender prop

#rm(list = ls())  # Clean up

#############Chi-Squared Goodness of Fit Test#####################

#import data
acl<-fread("https://d37djvu3ytnwxt.cloudfront.net/assets/courseware/v1/2bad2d0d8a5d13fcc764f30b5ebcc9c5/asset-v1:UTAustinX+UT.7.11x+2T2017+type@asset+block/AustinCityLimits.csv")

#inspect data
str(acl)

# So suppose we hear a claim that 1/3 of all the ACL Live artists have won a Grammy. We can test this claim
# with a Chi-squared Goodness-of-Fit test. First, we’re going to need to make a table of the actual counts
# of our yes’s and no’s in our Grammy variable.
# So let's call this table gtab and just assign it as the table of our Grammy variable:

gtab<-table(acl$Grammy)
gtab
addmargins(gtab)
prop.table(gtab)
round(addmargins(prop.table(gtab)),2)*100


# So the next step we have to take is actually create a vector that holds
# the claimed proportions of yes’s and no’s. So if the claim was that 1/3
# of ACL Live performers have won a Grammy,
# that means that 2/3 haven’t. So I’m going to make a vector called
# claimp by using my concatenate function.
# Again, that’s just a lowercase c. And then I’m just going to give
# it some items to concatenate into a vector.
# So my claim proportion for the no category, which appears
# first alphabetically, is 2/3. And then my second
# claim proportion is my yes category, 1/3. 
# So just be careful to keep it in the correct order that it appears
# when you use the table function.
claimp <- c(2/3, 1/3)

# Now, I have a table of actual counts and then a set of claimed proportions.
# And that’s what I need in order to run my Chi-squared Goodness-of-Fit test.
# So the function for that is called chisq.test(). And the first argument it 
# takes is just my table of observed counts. And then I need to give it the
# option p = my vector of claimed proportions.
chisq.test(gtab, p = claimp)

# If we run this line, we’re going to see the test output in our console window. It gives us that chi-squared
# value, or test statistic, 4.14, our degrees of freedom, which is the number of categories minus 1– so that’s
# just 1– and also our p-value. So we see here our p-value is less than 0.05, and that allows us to reject the null
# hypothesis that these were, in fact, the actual proportions of Grammy winners and non-Grammy winners.
# Now, one thing we have to also make sure to do is to check our assumption of sufficient sample size. And
# we can do this within our chisq.test() function. If we ask for “$expected” at the end of that function,
# we’re basically saying, go into this test and give us this vector, called expected, which is a component of our
# chi-squared test that we just don’t see in the output.
chisq.test(gtab, p = claimp)$expected
# If we run that, we’re going to get the expected counts for each category. And basically, what R is doing is
# taking our claim proportions and saying, if this were true, we would expect 77.3 artists to not have won a
# Grammy and 38.6 to have won a Grammy.
# Because both of these are over 5, we meet our sufficient sample size assumption, so we’re OK to carry out the
# Chi-squared Goodness-of-Fit test. And it also allows us to see that we actually had more Grammy winners
# than expected. There were actually 49 artists who have won a Grammy, when we were expecting there'd
# only be 38.6.

#############Pre-Lab 4: Austin City Limits##########################
#Primary Research Questions
#1. Are there an equal number of male and female performers on Austin City Limits?
#2. Are male performers just as likely to have had a Top 10 hit as female performers?
#1a. In what year did Allen Toussaint play at Austin City Limits?
subset(acl, subset = Artist == "Allen Toussaint")
#2009
#1b. How many years old was Allen Toussaint when he performed?
#75

#1c. How many variables for Allen Toussaint have missing data?
sum(is.na(subset(acl, subset = Artist == "Allen Toussaint")))
#Check the Variables of Interest
#Let's find the variables we need to answer the question.

#2a. Which variable tells us whether the performer is male or female?
names(acl)
#The variable name in the dataset is Gender, which is a categorical variable.
#2b. Which variable tells us whether the artist has had a Top 10 hit?
#The variable name in the dataset is BB.wk.top10, which is a categorical variable.

#Reflect on the Method

#Which method should we be using for this analysis and why?

#3a. We will use a Chi Square Goodness of Fit test to check whether
#there were an equal number of male and female performers. Why?
#ANSWER: We want to see if the distribution of a categorical variable matches a proposed distribution model.

#3b. We will use a Chi Square Test of Independence to determine if
#male and female performers were equally likely to have had a Top 10 hit. Why?
#ANSWER: We want to determine if there is an association between two categorical variables. 

# Breakdown Your Analysis
# 
# Let's break this analysis into its required steps:
# 
# Goodness of Fit Test:
# 1. Make a table of counts for gender.
# 2. Create a vector of the expected proportions.
# 3. Check the expected counts assumption.
# 4. Run the chi square test.
# 5. Interpret the chi square statistic and p-value.
# 
# Test of Independence:
# 1. Create a two-way table for gender and Top 10 hits.
# 2. Check the expected counts assumption.
# 3. Run the chi square test.
# 4. Interpret the chi square statistic and p-value.
# 
# Here is the code you will use:
# 
############# Week 4 Prelab: Question 1 (Goodness of Fit)##############
#1. Are there an equal number of male and female performers on Austin City Limits?
# Create a table of counts for Gender
gender_tab <-table(acl$Gender)
gender_tab

# Create vector of expected proportions
ExpGender <- c(.50, .50)
# Check expected counts assumption
chisq.test(gender_tab, p=ExpGender)$expected
# Run goodness of fit
chisq.test(gender_tab, p=ExpGender)
##########Week 4 Prelab Question 2 (Test of Independence)##############
#2. Are male performers just as likely to have had a Top 10 hit as female performers?
# Create two-way table
gender_top10 <-table(acl$Gender, acl$BB.wk.top10)
gender_top10

# Generate expected counts
chisq.test(gender_top10, correct=FALSE)$expected
# Run test of independence
chisq.test(gender_top10, correct=FALSE)
#1. If we wanted to test the hypothesis that
#the performers were 30% female and 70% male,
#what would the code look like? (Note that
#categorical values are referenced in alphabetical
#order).
ExpGender <- c(.30,.70) 

#2. Suppose the following values were returned for the
#"check expected counts" assumption in our goodness of fit test.
#Would the assumption be violated?
#Yes, because there are fewer than 5 expected Females. correct

#3. Which line of code is not necessary for a test of independence because there is no particular distribution model being tested?
ExpGender_top10 <- c(.25, .25, .25, .25) 

#4. How many degrees of freedom should there be for our test of independence? Remember, performers have either had (or not had) a Top 10 hit.
#1 degree of freedom: df = (rows - 1) * (columns - 1) = (2 - 1) * (2 - 1) = 1

# Suppose we wanted to test whether there was an even distribution
#among the seasons. What caused the following error below?
#(You may want to examine the dataset in R for help.)
# 
#acl <- AustinCityLimits
season_counts <- table(acl$Season)
ExpSeason <- c(1/4, 1/4, 1/4, 1/4)
#chisq.test(season_counts, p=ExpSeason)
# 
# Error in chisq.test(season_counts, p = ExpSeason) : 'x' and 'p' must have the same number of elements
#ANSWER: There are not 4 seasons in our dataset, although line 3 suggests that there are.
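#A quick way to spot the mismatch (a sketch): compare how many Season
#categories actually appear in the table with how many proportions we supplied.
length(season_counts) #number of Season levels in the data
length(ExpSeason) #4 proportions were supplied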
#Conduct the Analysis
# Goodness of Fit Test
# 
# Use the output of your analysis to answer the following questions.
# 
# 1a. There were ________ male and 35 female artists.
gender_tab
#1b. The expected counts were ________ artists of each gender.
chisq.test(gender_tab, p=ExpGender)$expected
#1c. Chi Square (rounded to 2 decimal places, with df=1)=
chisq.test(gender_tab, p=ExpGender)
round(18.241,2)

#1d. The p-value was _______ 0.05.
#Less than
p_value<-1.946e-05

#1e. We should ________ the hypothesis that there were an equal number of male and female performers at ACL Live.
#Reject H0
p_value <= .05 #reject null hypothesis if true
p_value > .05 #fail to reject null hypothesis if true
#Test of Independence
#Use the output of your analysis to answer the following questions.

#2a. Among the male artists, _______ out of 70 had a Top 10 hit.
gender_top10
addmargins(gender_top10)
32

#2b. Among the female artists, _______ out of 33 had a Top 10 hit.
18

#2c. Based on these values, there must have been _________ males and 2 females with missing values for the Top 10 variable.
sum(is.na(acl$BB.wk.top10))

#2d. Chi Square (rounded to 3 decimal places, df=1)=
chisq.test(gender_top10, correct=FALSE)
round(0.70023,3)

#2e. p-value (rounded to 3 decimal places)=
p_value<-print(round(0.4027,3))

#2f. We should ______ the hypothesis that gender is not
#associated with having a Top 10 hit.
p_value
p_value <= .05 #reject null hypothesis if true
p_value > .05 #fail to reject null hypothesis if true
#Fail to reject


prop.table(gender_top10,1)
addmargins(prop.table(gender_top10,1))
top10 <-table(acl$BB.wk.top10) #create a table of sole top 10 winners
top10 #number of no top 10 hits vs top 10 hits
prop.table(top10) #percentages of no top 10 hits vs top 10 hits

#3. Was the expected counts assumption violated in either of these chi square tests?
#ANSWER: It was not violated in either test. 

#Write Your Conclusion

#Answer the question and support your answer with statistics:

# First we examined whether there were an equal number of male
# and female artists on Austin City Limits. In our sample, there
# were 81 males and 35 females. A chi square goodness of fit test
# showed that this difference was statistically significant
# (chi square = 18.24, df = 1, p < .05). There are more males than
# females on the show. Second, we asked whether male and female
# artists were equally likely to have had a Top 10 hit.
# Approximately 55% of the female artists had a Top 10 hit,
# and 46% of the male artists had a Top 10 hit. This difference
# was not statistically significant. A chi square test of independence
# found top 10 hits to be independent of gender (chi square = 0.700, df = 1,
# p = .403). The assumptions for each test were met.


################################Week 4: Lab############################################


# Known as the “Live Music Capital of the World,” Austin, Texas is
# also home to the longest-running music series in American television
# history, Austin City Limits.  This dataset includes data on a sample
# of musicians that performed live on the PBS television series
# Austin City Limits over the last 10 years.  Data on each artist
# include measures of commercial popularity, such as the number of
# social media followers on Twitter or Facebook, and their success
# in winning a Grammy Music Award.

#Review of Chi Square Tests

# In this lab, you will use Chi Square Tests to answer a
# question of interest. Let's start by remembering why we
# use each Chi Square test.
# 
# 1a. In a Chi Square Goodness of Fit test, a proposed distribution
# model is compared to an observed
#ANSWER: marginal distribution

#1b. Two categorical variables are said to be independent if their conditional distribution matches
#the distribution of expected counts, when the variables are assumed not to be related.

#2a. Are each of the four musical genres equally represented on Austin City Limits?
#Goodness of fit test

#2b. Are some genres more likely to draw a large (100K+) Twitter following than others?
#Test of independence

# Primary Research Questions
# 
# 1. Are each of the four musical genres equally represented on Austin City Limits?   Goodness of fit test
# 2. Are some genres more likely to draw a large (100K+) Twitter following than others?

#Goodness of Fit Test
# Question 1 (Goodness of Fit)
#1. Are each of the four musical genres equally represented on Austin City Limits?
# Create a table of counts for Genre
genre_tab <-table(acl$Genre)
genre_tab

# Create vector of expected proportions
ExpGenre <- c(.25, .25, .25, .25)

# Check expected counts assumption
chisq.test(genre_tab, p=ExpGenre)$expected

# Run goodness of fit
chisq.test(genre_tab, p=ExpGenre)


#1a. What was the expected count of artists for each genre?
chisq.test(genre_tab, p=ExpGenre)$expected

#1b. What was the Chi-square statistic? (round to 2 decimal places)
chisq.test(genre_tab, p=ExpGenre)
round(70.414, 2)

#1c. How many degrees of freedom?
3
#1d. What was the p-value?
p_value<-print(3.481e-15)
p_value <= .05 #reject null hypothesis if true
p_value > .05 #fail to reject null hypothesis if true
#reject null hypothesis

#Test of Independence

#Question 2 (Test of Independence)
#2. Are some genres more likely to draw a large (100K+) Twitter following than others?
# Create two-way table
genre_twitter <-table(acl$Genre, acl$Twitter.100k)
genre_twitter

# Generate expected counts
chisq.test(genre_twitter, correct=FALSE)$expected

# Run test of independence
chisq.test(genre_twitter, correct=FALSE)

#2a. Using the data from your two-way table, 
#compute the proportion of artists in each genre with 100K+ Twitter followers.
#(Round to 3 decimal places).
genre_twitter
addmargins(genre_twitter)
prop.table(genre_twitter,1)
round(addmargins(prop.table(genre_twitter,1)),3)

#Country?
0.353
#Jazz?
0.182
#Rock?
0.441
#Singer-Songwriter?
0.625

#2b. What was the Chi-square statistic? (round to 2 decimal places)
round(5.6919,2)

#2c. How many degrees of freedom?
3

#2d. What was the p-value?
p_value<-print(0.1276)
p_value <= .05 #reject null hypothesis if true
p_value > .05 #fail to reject null hypothesis if true
#Fail to reject

#Conclusions

#3a. We should reject the hypothesis that each genre is equally represented at ACL Live.

#3b. We should fail to reject the hypothesis that genre is independent of Twitter followers.

#Write your Conclusion

#Answer the question and support your answer with statistics:

# First we examined whether genres were represented equally at
# Austin City Limits. In our sample, there were 18 country,
# 13 jazz/blues, 68 rock/folk/indie, and 17 singer/songwriter acts.
# A Chi-square goodness of fit test showed that this difference was
# statistically significant (Chi-square= 70.41, df= 3,
# p<0.05). There is a higher representation of rock/folk/indie artists
# than other artists on the show.
# 
# Second, we asked whether some genres were likely to draw
# more Twitter followers (over 100K). A Chi-square test of independence
# revealed that there was no significant finding--a large Twitter
# following was independent of genre (Chi-square= 5.69, df= 3,
# p-value=0.1276).
################################Week 4: Problem Set############################################

# Question 1
# 
# You want to know if the proportion of female performers on
# Austin City Limits Live has changed in the past two years. 
# 
# 1. Create a new variable in the dataset called "Recent" that is equal to a 1 for rows from years 2012 or 2013 and is equal to 0 for all other rows.
# 
# 2. Make a table that shows the number of male and female performers in "recent" and non-recent years.
# 
# 3. Use this data to answer the following questions.
# 
# Use the "AustinCityLimits.csv" dataset to answer the following questions.  Instructions for installing "AustinCityLimits.csv" can be found under the Examine the Data unit in this week's Pre-Lab section.
# 
# You'll need to use the following code to help:
acl$Recent[acl$Year < 2012] <- 0 
acl$Recent[acl$Year >= 2012] <- 1

recent_gender_tab <-table(acl$Gender, acl$Recent)
recent_gender_tab
addmargins(recent_gender_tab)
# Generate expected counts
chisq.test(recent_gender_tab, correct=FALSE)$expected

# Run test of independence
chisq.test(recent_gender_tab, correct=FALSE)

#1a. How many female performers have been on the show in
#the past two years (2012 and 2013)?
#ANSWER: 12

#1b. What is the appropriate method to test if representation
#by female performers is different before 2012 compared to since
#2012?
#Chi-Square Test of Independence

#1c. Report expected counts for the following performer groups.
#Females before 2012
#Answer: 26.55

#Females in 2012 and 2013
#Answer: 8.45

#Males before 2012
#Answer: 61.45
#Males in 2012 and 2013
#Answer: 19.55

#1d. What is the Chi Square statistic? (Round to 2 decimal places.)
round(2.8188,2)

#1e. What is the p-value for the test? (Round to 2 decimal places.)
round(0.09317,2)
#1f. What is the appropriate conclusion for this test, assuming α = 0.05?
p_value<-print(.09317)
p_value <= .05 #reject null hypothesis if true
p_value > .05 #fail to reject null hypothesis if true
#Fail to reject
#We fail to reject the null hypothesis; gender is independent of performance before or after 2012.
#rm(list = ls())  # Clean up
#Question 2

# When crossing white and yellow summer squash, a genetic model
#predicts that 75% of resulting offspring will be white, 15% will
#be yellow and 10% will be green. 
# 
# Below are the results from an experiment run on a random
#sample of 205 squash offspring.

ob1<-152
ob2<-39
ob3<-14
obtotal<- ob1 + ob2 + ob3 #make sure counts add up to sample size
observations<-data.frame(cbind(ob1, ob2, ob3))
observations

#To identify the expected values, multiply the
#expected % by the total number observed:
#75% of resulting offspring will be white, 15% will
#be yellow and 10% will be green.
exp1<-.75 * obtotal
exp2<-.15 * obtotal
exp3<-.10 * obtotal
expected<-data.frame(cbind(exp1, exp2, exp3))
expected
#To calculate our chi-square statistic,
#we need to sum the squared difference between
#each observed and expected value divided by the
#expected value:
chiSquare<-((ob1-exp1)*(ob1-exp1)/exp1) +
  ((ob2-exp2)*(ob2-exp2)/exp2) +
  ((ob3-exp3)*(ob3-exp3)/exp3)
chiSquare

degreesFreedom<-print(3-1) #how many groups/levels - 1
#2a. Which method should we use to test if these data are consistent with the ratio of offspring
#colors predicted by the genetic model?
#ANSWER: Chi Square Goodness of Fit Test correct
#2b. What is the expected count of white offspring? (Round to 2 decimal places.)
exp1

#2c. What is the expected count of yellow offspring? (Round to 2 decimal places.)
exp2

#2d. What is the expected count of green offspring? (Round to 2 decimal places.)
exp3

#2e. Are the expected count conditions met?
#YES
#2f. What are the degrees of freedom and the critical value for this test, assuming α = 0.05?
#degreesFreedom = 2

#Critical Value for alpha=.05: (Round to 2 decimal places.)
#browseURL("https://people.richland.edu/james/lecture/m170/tbl-chi.html")
critical_value<-5.991
round(critical_value,2)
qchisq(.05, df = degreesFreedom, lower.tail = FALSE) #R method: we always have a positive critical value
#2g. What is the Chi Square statistic for this test? (Round to 2 decimal places.)
round(4.2943,2)

# table of counts
observations
# Create vector of expected proportions
ExpSquash <- c(.75, .15, .10)

# Check expected counts assumption
chisq.test(observations, p=ExpSquash)$expected

# Run goodness of fit
chisq.test(observations, p=ExpSquash)

#2h. What is the appropriate outcome for this hypothesis test?
p_value<-print(0.1168)
p_value <= .05 #reject null hypothesis if true
p_value > .05 #fail to reject null hypothesis if true
#ANSWER: Fail to reject null hypothesis --H0
#rm(list = ls())  # Clean up

#Question 3

#Approximately 13% of the world's population is left-handed, but is this proportion
#the same across men and women?
#To answer this question, you decide to collect data from a random sample
#of adults from your neighborhood, with the following results:

gender<-c("M","M","F","M", "F", "F", "F", #1-7
  "M", "F", "F", "M", "F", "M", "M", "F", #8-15
  "M", "M","F", "F", "M", "F") #16-21
dominant_hand<-c("L", "R", "R", "R", "R", "L", "L", #1-7
  "R", "R", "R", "L", "R", "R", "R", "R", #8-15
  "R", "R", "R", "L", "R", "R") #16-21
gender_hand<-data.frame(cbind(gender, dominant_hand))
gender_hand_table<-table(gender_hand)
addmargins(gender_hand_table)

# Generate expected counts
chisq.test(gender_hand_table, correct=FALSE)$expected

# Run test of independence
chisq.test(gender_hand_table, correct=FALSE)


# H0 : Gender and hand-dominance are independent (the proportion of
# left-handers, about 13%, is the same for men and women)
# HA : Gender and hand-dominance are not independent


#3a. Which of these is the appropriate null hypothesis for this test?
#ANSWER: Gender and hand-dominance are independent. 

#3b. What would be the estimated degrees of freedom and the critical value for this analysis, assuming α = 0.05?

degreesFreedom<-1
qchisq(.05, df = degreesFreedom, lower.tail = FALSE) #R method: we always have a positive critical value
#Critical Value(Round to 2 decimal places.)
round(3.841459,2)
#3c. What are the expected counts for Males? (Round to 2 decimal places.)
#Left-handed Males
round(2.380952,2)
#Right-handed Males(Round to 2 decimal places.)
round(7.619048,2)
#3d. What are the expected counts for Females? (Round to 2 decimal places.)

#Left-handed Females
round(2.619048,2)
#Right-handed Females(Round to 2 decimal places.)
round(8.380952,2)
#3e. Are all relevant conditions met to run this analysis?
#ANSWER: No!

#rm(list = ls())  # Clean up
#Question 4

# A telephone survey asked a random sample of Indiana voters
# about their home internet usage, as well as what type of
# community (rural, suburban or urban) they lived in. 
# 
# Of the 123 survey respondents, 28 were from rural areas,
# 42 were from suburban areas, and 53 were from urban areas.
# Thirteen rural respondents, 35 suburban respondents, and
# 50 urban respondents said they had access to internet at
# home. 

respondents<-matrix(c(13,15,35,7,50,3), ncol = 2, byrow = TRUE)
colnames(respondents) <- c("internet_access", "no_access")
rownames(respondents) <- c("rural", "suburban", "urban")
respondents<-as.table(respondents)
respondents
addmargins(respondents)
round(addmargins(prop.table(respondents)),3) #rounded to 3 decimals

# Generate expected counts
round(addmargins(prop.table(respondents)),3)
chisq.test(respondents, correct=FALSE)$expected

# Run test of independence
chisq.test(respondents, correct=FALSE)

#4a. What is the appropriate null hypothesis for this test?
#Test of independence
#ANSWER: Home internet access and community type are independent. 
#4b. What proportion of respondents had internet access at home? (Round to 3 decimal places.)
round(98/123,3)
round(0.79674797,3)
addmargins(prop.table(respondents))

#4c. What proportion of respondents did NOT have internet access at home? (Round to 3 decimal places.)
round(25/123,3)
#4d. How many rural residents would we expect to have home internet?
#Please use a rounded proportion you calculated above. (Round to 2 decimal places.)
round(22.30894,2)

#4e. How many urban residents would we expect NOT to have home internet? Please use a rounded proportion you calculated above. (Round to 2 decimal places.)
round(10.772358,2)

#4f. Does this data provide sufficient evidence that internet access at home depends on what type of community the Indiana voters live in?
p_value<-print(1.763e-06)
p_value <= .05 #reject null hypothesis if true
p_value > .05 #fail to reject null hypothesis if true
#Reject null hypothesis
#H0: Home internet access and community type are independent. 
#HA:Home internet access and community type are not independent.
#ANSWER: YES!

##############Week 5:  Hypothesis Testing (More Than Two Group Means)  ##########################
#http://homepages.inf.ed.ac.uk/bwebb/statistics/ANOVA_in_R.pdf

# Define analysis of variance (ANOVA) as a statistical
# inference method that is used to determine - by
# simultaneously considering many groups at once -
# if the variability in the sample means is so large
# that it seems unlikely to be from chance alone.

# Recognize that the null hypothesis in ANOVA sets all
# means equal to each other, and the alternative hypothesis
# suggests that at least one mean is different.

# The Null and Alternative hypotheses for a one-way ANOVA can be written as:
# H0 : Means of all factor levels are equal
# HA : At least one factor level has a different mean
# The ANOVA can be used when we want to test the means of three or more populations at once. 

#alternatively:
# H0: μ1 = μ2 = ⋯ = μk
# HA: At least one mean is different

# When we conduct a hypothesis test, we are testing the probability of obtaining an extreme F-statistic
# by chance.

#The conditions necessary for performing ANOVA

# 1. the observations should be independent within and across
# groups
# 2. the data within each group are nearly normal
# 3. the variability across the groups is about equal
# and use graphical diagnostics to check if these conditions
# are met.

# The test statistic for ANOVA, the F statistic,
# is calculated as the ratio of the mean square between
# groups (MSG, variability between groups) and mean
# square error (MSE, variability within errors). Also
# recognize that the F statistic has a right skewed
# distribution with two different measures of degrees of
# freedom: 
#   1. one for the numerator (dfG=k−1, where k is the number
#   of groups)
#   2. one for the denominator (dfE=n−k, where n is the
#     total sample size).
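#For example, with the four insurance groups analyzed at the end of this
#section (k = 4 groups of 8 observations, so n = 32 in total):
k <- 4; n <- 32
k - 1 #dfG = 3 (numerator)
n - k #dfE = 28 (denominator)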

#The calculation of the p-value for ANOVA is always
#"one sided".

# Conducting many t-tests for differences between each pair
# of means leads to an increased Type 1 Error rate, and we
# use a corrected significance level (Bonferroni correction,
# α* = α/K, where K is the number of comparisons being
# considered) to combat inflating this error rate.
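#A sketch of Bonferroni-corrected pairwise comparisons in base R
#(toy data; pairwise.t.test multiplies each pairwise p-value by K):
scores <- c(rnorm(10, mean = 5), rnorm(10, mean = 6), rnorm(10, mean = 5.5))
groups <- rep(c("g1", "g2", "g3"), each = 10)
pairwise.t.test(scores, groups, p.adjust.method = "bonferroni")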
#it is possible to reject the null hypothesis in ANOVA but
#not find significant differences between groups when doing
#pairwise comparisons.

# Construct bootstrap confidence intervals using one of
# the following methods:
# 
# 1. Percentile method: XX% confidence level is the middle
# XX% of the bootstrap distribution.
# 2. Standard error method: If the standard error of the
# bootstrap distribution is known, and the distribution
# is nearly normal, the bootstrap interval can also be
# calculated as point estimate ± t* × SE_boot.

#When the bootstrap distribution is extremely skewed and
#sparse, the bootstrap confidence interval may not be
#reliable.
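#A minimal sketch of the percentile method in base R (toy sample):
set.seed(1) #for reproducibility
x <- c(3.2, 3.5, 2.7, 4.1, 3.1, 3.7, 4.2, 3.6)
boot_means <- replicate(10000, mean(sample(x, replace = TRUE)))
quantile(boot_means, c(.025, .975)) #middle 95% of the bootstrap distribution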

#Example

# Is there a difference between the average vocabulary scores
# of Americans from different (Self reported) classes?
# 
# Compare means of 2 groups using a T statistic
# Compare means of 3+ groups using analysis of variance
# (ANOVA) and a new statistic called F.
# H0: The average vocabulary score is the same across all
# social classes
# μ1=μ2=μ3=μ4
# 
# HA:The average vocabulary scores differ between at least
# 1 pair of social classes.

#The numerical variable is the response variable in ANOVA.
# sum of squares total (SST):
# Measures the total variability in the response variable
# Calculated very similarly to variance (except not
#   scaled by the sample size)
# Calculated from the squared deviations of the response variable
# around the grand mean


# sum of squares groups (SSG)
# measures the variability between groups
# Explained variability: squared deviations of group means
# from the overall mean, weighted by sample size


# sum of squares error(SSE)
# measures the variability within groups
# unexplained variability: unexplained by
# the group variable, due to other reasons
  #SSE=SST-SSG

# Degrees of Freedom associated with ANOVA:

#   total: dfT = n-1 (sample size minus 1)
#   group: dfG = k -1 (number of groups minus 1)

#   error: dfE = dfT - dfG (the difference between the two above)

# Mean squares: Average variability between and within groups,

# calculated as the total variability (sum of squares) scaled
# by the associated degrees of freedom.
#   group: MSG = SSG/dfG
#   error: MSE = SSE/dfE

# F Statistic: Ratio of the average between group and
# within group variabilities:
#   F = MSG/MSE

# p-value is the probability of at least as large a ratio
# between the "between" and "within" group variabilities
# if in fact the means of all groups are equal

# area under the F curve, with degrees of freedom dfG
# and dfE, above the observed F statistic
#using R's pf function

f_value<-21.735
df_groups<-3 #numerator df: dfG = k - 1 (here k = 4 social classes)
degreesFreedom<-791 #denominator df: dfE = n - k

pf(f_value, df_groups, degreesFreedom, lower.tail = FALSE)
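#For comparison, the F critical value the statistic must exceed at alpha = .05:
qf(.05, df1 = df_groups, df2 = degreesFreedom, lower.tail = FALSE)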

#Conclusion

# If p-value is small (less than alpha), reject H0
#   the data provide convincing evidence that at least
#   one pair of population means are different from each
#   other (but we can't tell which one)

#If p-value is large, fail to reject H0.

  # The data do not provide convincing evidence that at least
  # one pair of population means are different from each other,
  # the observed differences in sample means are attributable
  # to sampling variability (or chance).

#We set up our ANOVA in a general way: dependent ~ explanatory1 + explanatory2 + ...

#Conditions for ANOVA
# 1. Independence
#   Within groups: sampled observations must be independent
#          --random sample/assignment (observational study vs. experiment)
#          --each n_j less than 10% of its respective population
#          --always important but sometimes difficult to check
#   Between groups: the groups must be independent of each other
#   (non-paired)
#          --carefully consider whether the groups may be dependent
#          --if the groups are paired, use repeated measures ANOVA instead
# 2. Approximate normality: Distributions should be nearly
# normal within each group
# 3. Equal variance: groups should have roughly equal
# variability
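
#A minimal sketch (my addition, not from the course) of eyeballing conditions
#2 and 3 on a small made-up stacked data frame with columns `values` and `ind`,
#the same shape as the data frames built in the examples below:
set.seed(1)
demo_df <- stack(list(a = rnorm(20, 5, 1), b = rnorm(20, 5, 1), c = rnorm(20, 5, 1)))
#2. approximate normality: QQ plot per group (shown here for group "a")
qqnorm(demo_df$values[demo_df$ind == "a"]); qqline(demo_df$values[demo_df$ind == "a"])
#3. equal variance: the group standard deviations should be roughly equal
tapply(demo_df$values, demo_df$ind, sd)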

###############Working through ANOVA(Page 297)####################
#Suppose a salesperson wants to compare the level of satisfaction
#of customers for four different insurance
#companies. Our question is:

#"Is there a difference in satisfaction scores across the four
#different insurance companies?"

group1<- c(3.2, 3.5, 2.7, 4.1, 3.1, 3.7, 4.2, 3.6)
group2<- c(4.2, 3.7, 3.4, 4.3, 3.9, 4.1, 3.1, 4.5)
group3<- c(5.4, 4.6, 4, 5.3, 4.7, 4.2, 4.9, 4.7)
group4<- c(4.5, 3.8, 4.1, 3.1, 4.2, 3.4, 4.2, 4.5)
all_observ<-c(group1, group2, group3, group4) #combine all the observations into one vector
#create a dataframe to do ANOVA
df<-stack(list(group1 = group1, group2 = group2, group3 = group3, group4 = group4))
head(df)

#the R way
anova_analysis <- aov(values ~ ind, data=df)
summary(anova_analysis)
#the by hand way
total_n<-print(length(all_observ)) #total observations
grand_mean<- print(mean(all_observ)) #the mean of all the observations
n<-8 #observations in each group
k<- 4#the number of groups
n_group1<-length(group1)
n_group2<-length(group2)
n_group3<-length(group3)
n_group4<-length(group4)

sample_mean1<-mean(group1)
sample_mean2<-mean(group2)
sample_mean3<-mean(group3)
sample_mean4<-mean(group4)

#1. Calculate the total sum of squares (SST).
#SST = ∑(y − ȳ)² = ∑y² − (∑y)²/n
sum_of_squares<-print(all_observ-grand_mean)
sst<-print(sum(sum_of_squares^2))

#2. Calculate the sum of squares between (SSB): the sum of squared
#deviations of the group means from the grand mean ȳ.
#SSB = ∑ n_k (ȳ_k − ȳ)²
ssb<-n*(sample_mean1 - grand_mean)^2 + n*(sample_mean2 - grand_mean)^2 +
  n*(sample_mean3 - grand_mean)^2 + n*(sample_mean4 - grand_mean)^2
ssb
#3. Find the sum of squares within groups (SSW) by subtracting:
#SSW = SST − SSB
ssw <- sst - ssb
ssw
#4. Next solve for degrees of freedom for the test:
# dfTotal = N − 1
# dfBetween = k − 1 (subtract 1 from the number of groups you have)
# dfWithin = N − k

df_group1<-n_group1 - 1
df_group2<-n_group2 - 1
df_group3<-n_group3 - 1
df_group4<-n_group4 - 1
dfTotal<- print(total_n-1)

dfBetween<- print(4 - 1)
dfWithin<- print(df_group1 + df_group2 + df_group3 + df_group4)


#5. Using the values, you can now calculate the Mean Squares Between (MSB) and Mean Squares Within
#(MSW ) using the relationships below:
#MSB = SSB/dfBetween
#MSW = SSW/dfWithin

msb<- print(ssb/dfBetween)

msw<- print(ssw/dfWithin)

#6. Finally, calculate the F statistic using the following ratio
#F = MSB/MSW

f_statistic<-print(msb/msw)

#qf() arguments: alpha, df1 = degrees of freedom between, df2 = degrees of freedom within
criticalValue<-print(qf(.05, df1 = 3, df2 = 28, lower.tail = FALSE)) #R method: the critical value is always positive
#Interpret the results of the hypothesis test.
# In our example of the insurance companies,
# our F-value from the ANOVA test is greater than the F-critical value, so we would reject our Null Hypothesis. We
# can conclude that the average customer satisfaction scores of the four insurance companies are not equal to one
# another – at least one of them is different from the others.
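
#Making that comparison explicit in code, using the values computed above:
f_statistic > criticalValue #TRUE, so we reject H0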
#############Apply our Knowledge Example A (Page 299)##############################
group1<- c(1,4,3,2,5,1,6)
group2<- c(8,6,7,4,3,5)
group3<- c(7,6,4,9,8,5,7,5)
group4<- c(9,10,8,6,5)
group5<- c(10,12,9,11,8)
all_observ<-c(group1, group2, group3, group4, group5) #combine all the observations into one vector

#create data frame
df<-stack(list(group1=group1, group2=group2, group3= group3,
  group4 = group4, group5 = group5))
head(df)

#the R way
anova_analysis <- aov(values ~ ind, data=df)
summary(anova_analysis)
criticalValue<-qf(.05, df1 = 4, df2 = 26, lower.tail = FALSE) #R method: we always have a positive critical value
criticalValue

#the by hand way

total_n<-print(length(all_observ)) #total observations
grand_mean<- print(mean(all_observ)) #the mean of all the observations
#group sizes differ in this example, so use each group's own n below
k<-sqldf('SELECT COUNT (DISTINCT ind) FROM df') #get the num of groups
k<-k[1,] #extract the count as a number

n_group1<-print(length(group1))
n_group2<-print(length(group2))
n_group3<-print(length(group3))
n_group4<-print(length(group4))
n_group5<-print(length(group5))

sample_mean1<-mean(group1)
sample_mean2<-mean(group2)
sample_mean3<-mean(group3)
sample_mean4<-mean(group4)
sample_mean5<-mean(group5)

#Step 3: Hypothesis

#1. Calculate the total sum of squares (SST).
sum_of_squares<-print(all_observ-grand_mean)
sst<-print(sum(sum_of_squares^2))

#2. Calculate the sum of squares between (SSB): the sum of squared
#deviations of the group means from the grand mean ȳ.
#SSB = ∑ n_k (ȳ_k − ȳ)²
ssb<-(n_group1*(sample_mean1 - grand_mean)^2) +
  (n_group2*(sample_mean2 - grand_mean)^2) +
  (n_group3*(sample_mean3 - grand_mean)^2) +
  (n_group4*(sample_mean4 - grand_mean)^2) +
  (n_group5*(sample_mean5 - grand_mean)^2)
ssb


#3. Find the sum of squares within groups (SSW) by subtracting:
#SSW = SST − SSB
ssw <- sst - ssb
ssw

#4. Next solve for degrees of freedom for the test:
# dfTotal = N − 1
# dfBetween = k − 1 (subtract 1 from the number of groups you have)
# dfWithin = N − k (k = the number of groups)

df_group1<-n_group1 - 1
df_group2<-n_group2 - 1
df_group3<-n_group3 - 1
df_group4<-n_group4 - 1
df_group5<-n_group5 - 1

dfTotal<- print(total_n-1)
dfBetween<- print(k - 1) ## The df BETWEEN is calculated by subtracting 1 from the number of groups you have.
dfWithin<- print(df_group1 + df_group2 + df_group3 + df_group4 + df_group5)

#5. Using the values, you can now calculate the Mean Squares Between (MSB) and Mean Squares Within
#(MSW ) using the relationships below:
#MSB = SSB/dfBetween
#MSW = SSW/dfWithin

msb<- print(ssb/dfBetween)

msw<- print(ssw/dfWithin)

#6. Finally, calculate the F statistic using the following ratio
#F = MSB/MSW

f_statistic<-print(msb/msw)

#qf() arguments: alpha, df1 = degrees of freedom between, df2 = degrees of freedom within
f_critical<-print(qf(.05, df1 = dfBetween, df2 = dfWithin, lower.tail = FALSE)) #R method: the critical value is always positive

f_statistic > f_critical #if True reject the H0


#Interpret the results of the hypothesis test.
# Since our calculated F-value is greater than the F-critical value, we can reject our Null Hypothesis. We conclude
# that the five reading program means are not all equal to one another – at least one reading method's
# mean score differs from the others.


##############Example: Hand Calculation of ANOVA######################
#http://oak.ucc.nau.edu/rh232/courses/EPS625/Handouts/One-Way%20ANOVA/Hand%20Calculation%20of%20ANOVA.pdf

group1<- c(9,8,6,8,10,4,6,5,7,7)
group2<- c(7,9,6,6,6,11,6,3,8,7)
group3<- c(11,13,8,6,14,11,13,13,10,11)
group4<- c(12,11,16,11,9,23,12,10,19,11)
group5<- c(10,19,14,5,10,11,14,15,11,11)
all_observ<-c(group1, group2, group3, group4, group5) #create one big list

df<-stack(list(group1=group1, group2=group2, group3= group3,
  group4 = group4, group5 = group5))

anova_analysis <- aov(values ~ ind, data=df)
summary(anova_analysis)

criticalValue<-print(qf(.05, df1 = 4, df2 = 45, lower.tail = FALSE)) #df1 = k - 1 = 4, df2 = N - k = 45 for 50 observations in 5 groups

sample_mean1<-print(mean(group1))
sample_mean2<-print(mean(group2))
sample_mean3<-print(mean(group3))
sample_mean4<-print(mean(group4))
sample_mean5<-print(mean(group5))

total_n<-print(length(all_observ)) #total observations
grand_mean<- print(mean(all_observ))

n<- length(group1) #observations in each group (all five groups have 10 here)
k<-sqldf('SELECT COUNT (DISTINCT ind) FROM df') #get the num of groups
k<-k[1,] #extract the count as a number
n_group1<-print(length(group1))
n_group2<-print(length(group2))
n_group3<-print(length(group3))
n_group4<-print(length(group4))
n_group5<-print(length(group5))

#1. Calculate the total sum of squares (SST).

sum_of_squares<-print(all_observ-grand_mean)
sst<-print(sum(sum_of_squares^2))

#2. Calculate the sum of squares between (SSB): the sum of squared
#deviations of the group means from the grand mean ȳ.
#SSB = ∑ n_k (ȳ_k − ȳ)²
ssb<-(n*(sample_mean1 - grand_mean)^2) +
  (n*(sample_mean2 - grand_mean)^2) +
  (n*(sample_mean3 - grand_mean)^2) +
  (n*(sample_mean4 - grand_mean)^2) +
  (n*(sample_mean5 - grand_mean)^2)
ssb


#3. Find the sum of squares within groups (SSW) by subtracting:
#SSW = SST − SSB
ssw <- sst - ssb
ssw

#4. Next solve for degrees of freedom for the test:
# dfTotal = N − 1
# dfBetween = k − 1 (subtract 1 from the number of groups you have)
# dfWithin = N − k (k = the number of groups)

df_group1<-n_group1 - 1
df_group2<-n_group2 - 1
df_group3<-n_group3 - 1
df_group4<-n_group4 - 1
df_group5<-n_group5 - 1

dfTotal<- print(total_n-1)
dfBetween<- print(k - 1)
dfWithin<- print(df_group1 + df_group2 + df_group3 + df_group4 + df_group5)

#5. Using the values, you can now calculate the Mean Squares Between (MSB) and Mean Squares Within
#(MSW ) using the relationships below:
#MSB = SSB/dfBetween
#MSW = SSW/dfWithin

msb<- print(ssb/dfBetween)

msw<- print(ssw/dfWithin)

#6. Finally, calculate the F statistic using the following ratio
#F = MSB/MSW

f_statistic<-print(msb/msw)

#qf() arguments: alpha, df1 = degrees of freedom between, df2 = degrees of freedom within
f_critical<-print(qf(.05, df1 = dfBetween, df2 = dfWithin, lower.tail = FALSE)) #R method: the critical value is always positive

f_statistic > f_critical #if True reject the H0
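
#The three hand calculations above repeat the same recipe, so here is a small
#helper (my own sketch, not part of the course) that wraps steps 1-6 for any
#stacked data frame with columns `values` and `ind`:
anova_by_hand <- function(stacked) {
  grand_mean <- mean(stacked$values)                        #mean of all observations
  sst <- sum((stacked$values - grand_mean)^2)               #1. total sum of squares
  group_means <- tapply(stacked$values, stacked$ind, mean)
  group_ns <- tapply(stacked$values, stacked$ind, length)
  ssb <- sum(group_ns * (group_means - grand_mean)^2)       #2. between-groups SS, per-group n
  ssw <- sst - ssb                                          #3. within-groups SS
  k <- length(group_means)
  df_between <- k - 1                                       #4. degrees of freedom
  df_within <- nrow(stacked) - k
  msb <- ssb / df_between                                   #5. mean squares
  msw <- ssw / df_within
  f <- msb / msw                                            #6. F statistic
  c(SSB = ssb, SSW = ssw, dfB = df_between, dfW = df_within,
    F = f, p = pf(f, df_between, df_within, lower.tail = FALSE))
}
anova_by_hand(df) #should agree with summary(aov(values ~ ind, data = df))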



##############Week 5:  Video Lectures Means: One-Way ANOVA ##########################
# 1. ANOVA is an appropriate statistical measure when we want to:
# compare the means of three or more populations at once.  

#2. In ANOVA, we calculate an F statistic. The F statistic is the ratio of:
#the variation between groups to the variation within groups. 

#3. If the null hypothesis for an ANOVA test is μA=μB=μC, what is the appropriate alternative hypothesis?
#At least one of the means is different.


#4. The source table below presents the results from an
#ANOVA comparing four treatment conditions with n=25
#participants in each condition. Compare all the missing
#values. Hint: Start with degrees of freedom. (Round to
#3 decimal places where needed.)

sst<-117
n<-25
msb<-19
f_critical<-2.699
k<-4 #num of groups (define k before using it)
dfBetween<- print(k - 1) #is 3
# dfTotal = N − 1
total_n<-25*4 #total num of observations
dfTotal<- print(total_n-1)
dfWithin<-dfTotal - dfBetween #99 - 3 = 96
ssb<-msb*dfBetween #19 * 3 = 57
ssw<-sst - ssb #117 - 57 = 60

msb<- print(ssb/dfBetween) #is 19
ssw <- sst - ssb
ssw
msw<- print(ssw/dfWithin)
f_statistic<-print(msb/msw)

#4a. SSB=57

#4b. SSW=60

#4c. dfB=3

#4d. dfW=96

#4e. dfT=99

#4f. MSW=0.625

#4g. F-stat=30.4

#4h. We should __________ the null hypothesis.
f_statistic > f_critical #if True reject the H0 
#Answer: Reject


#4i. Assume your F-statistic is significant, suggesting that at
#least one treatment condition is different from the others.
#How many post-hoc group comparisons will you need to run?
#Remember the formula for group comparisons: k(k−1)/2
post_hoc_comp<-print((k*(k-1))/2)
#ANSWER: 6


#The Bonferroni correction simply divides the
#significance level we used for the ANOVA,
#0.05, by 6. So our new significance level –
#just for the post-hoc comparisons – is 0.008.

#If we were to do these post-hoc comparisons by
#hand for our example, we would need to run 6
#independent samples t-tests and find the
#critical value for each comparison, based on
#the degrees of freedom for the comparison and
#the new significance level of 0.008.
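
#Base R can run all of those corrected t-tests in one call with
#pairwise.t.test(). As an illustration (my addition) on the reading-program
#data from the hand calculation above (5 groups, so 10 comparisons):
pairwise.t.test(df$values, df$ind, p.adjust.method = "bonferroni")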

#When more than one t-test is run, each at its own level of
#significance, the probability of making one or more Type I
#errors grows with every additional test.

# When testing more than one pair of samples, the
# probability of making at least one Type I error
# is: 1 − (1 − α)^C, where C is the number of comparisons
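
#For example, here is how that family-wise error rate grows with the
#number of comparisons C at alpha = 0.05:
C <- 1:10
round(1 - (1 - 0.05)^C, 3)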

#4j. Using the Bonferroni correction, what significance level should
#you use for each post-hoc hypothesis test if you want an overall
#significance level of 0.05?
post_hoc_sign_level<-print(.05/post_hoc_comp) #post_hoc_comp<-print((k*(k-1))/2)-- #k is the number of groups you have
#Answer: .008

#4k. What was the overall risk of making a Type I error in this ANOVA? (Round to 2 decimal places.)
#Answer: .05

#4l. What would the risk of a Type I error have been if you
#had run multiple t-tests instead? Assume α=0.05 for each test.
#(Round to 2 decimal places)
round(1-(1-.05)^6,2)
#Answer: 0.26

#4m. How would you interpret this difference in overall Type I error rate?
#The risk of finding a difference between group means, when one
#does not really exist, is 5 times lower when you run an ANOVA.
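
#Checking that "5 times" claim directly:
round((1-(1-.05)^6)/.05, 1) #about 5.3, so roughly 5 times the risk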

##############Week 5:  Video Lectures: Two-Way ANOVA ##########################

#########################Optional Review######################
#Visualizing Univariate Data

#import dataset from url
animaldata<-fread("https://d37djvu3ytnwxt.cloudfront.net/assets/courseware/v1/78cc1365ee7f9798d6dfd02cb35aab74/asset-v1:UTAustinX+UT.7.11x+2T2017+type@asset+block/AnimalData.csv")

head(animaldata)

table(animaldata$Sex)
## 
## Female   Male 
##    220    253

#creating a bar plot
ggplot(animaldata, aes(x=Sex)) + geom_bar() +
  ggtitle("Bar Chart of Animal Genders")

hist(animaldata$Age.Intake)

ggplot(animaldata, aes(x = Age.Intake)) +
  geom_histogram(binwidth = 2)



##Histograms by Groups

#faceted histogram
ggplot(animaldata, aes(x = Age.Intake)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(~ Sex)

##Boxplots

#import data
film<-fread("https://d37djvu3ytnwxt.cloudfront.net/assets/courseware/v1/d1c5e00eb9597e12480cd507b93cbb26/asset-v1:UTAustinX+UT.7.21x+2T2017+type@asset+block/FilmData.csv")

fivenum(film$Days)
## [1]  56 115 140 158 280

boxplot(film$Days, main = "Days in Theaters", ylab = "Days", xlab = "All Films")


film %>%
  ggplot(aes(x = 1, y = Days)) +
  geom_boxplot() + coord_flip() + ggtitle("Days in Theaters")

hist(film$Days)
# we can again see a pretty symmetric distribution. We do have a few cases of
# movies that stayed a lot longer in the theaters than the rest, but there isn’t a huge skew to this distribution,
# and there is not an extreme outlier that would give us trouble in any analysis.

#grouped boxplot
boxplot(film$Days ~ film$Genre)

film %>%
  ggplot(aes(x = Genre, y = Days)) +
  geom_boxplot() + coord_flip() + ggtitle("Days in Theaters")

#######################One-way Anova in R Tutorial#####################
#And we can see here in our grouped boxplot, the distribution of this variable, “Days”, for each “Genre”.
film %>%
  ggplot(aes(x = Genre, y = Days)) +
  geom_boxplot() + coord_flip() + ggtitle("Days in Theaters")

# If we wanted to test the hypothesis that the mean number of days spent in
# theaters is equal across all four genres,
# we could run a one-way ANOVA on this data. Before we get to the ANOVA
# function, I want to show you
# one new function that will help us look at the means of this variable
# across all four groups and also let us
# test one of the assumptions of an ANOVA.


# So this function is called aggregate(). And it basically takes a
# numeric variable as a function of a categorical
# variable and will produce a certain descriptive statistic for every
# single group. The way you enter the first
# argument is just the variable name, “Days”, and then that tilde again,
# and then the categorical variable. Now,
# we’re not going to be using the data frame dollar sign notation just
# because this function doesn’t work that
# way. We actually give it the data frame name in the second argument.
# So this is going to tell the function, go
# into the film data frame, find the variable, “Days”, and show it as a
# function of “Genre”. And then the final
# argument is the actual function we want to run on this variable. So
# let’s say, we want to look at the means.

aggregate(Days ~ Genre, film, mean) #Compute Summary Statistics of Data Subsets
##              Genre     Days
## 1 Action/Adventure 140.0631
## 2        Animation 154.5714
## 3           Comedy 136.0588
## 4            Drama 138.1111
# we can see that the animated movies spent the longest average number of days in the theater, while
# the comedies spent the fewest.

#Example
## Formulas, one ~ one, one ~ many, many ~ one, and many ~ many:
aggregate(weight ~ feed, data = chickwts, mean)
##        feed   weight
## 1    casein 323.5833
## 2 horsebean 160.2000
## 3   linseed 218.7500
## 4  meatmeal 276.9091
## 5   soybean 246.4286
## 6 sunflower 328.9167
## Dot notation:
aggregate(. ~ Species, data = iris, mean) #calculates the mean of all columns grouped by species
##      Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1     setosa        5.006       3.428        1.462       0.246
## 2 versicolor        5.936       2.770        4.260       1.326
## 3  virginica        6.588       2.974        5.552       2.026

# And what our ANOVA is going to tell us is whether these means are significantly different from each other or not. So
# one way we can actually use aggregate() to test an assumption of ANOVA, which is that the groups have equal
# variances, is to ask for the standard deviation instead of the mean. So by changing that third
# argument, we’re going to get, instead of the mean days, the standard deviation for each group.
aggregate(Days ~ Genre, film, sd)
##              Genre     Days
## 1 Action/Adventure 38.35202
## 2        Animation 30.79639
## 3           Comedy 27.30492
## 4            Drama 37.45479

# So just eyeballing it here, we see that there’s not a whole lot of difference in the standard deviation of this
# variable for the four genres of movies. Also in the previous video, we saw the distribution of days for each
# genre looked fairly symmetric across all four categories. And also the spread of the variables was roughly
# even. We do see a larger spread for the action adventure movies, which is shown here in our largest standard
# deviation. But because they’re not so far apart, we would say that our assumptions for this one-way ANOVA
# are met.
# 
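
#For a more formal check than eyeballing the standard deviations, base R's
#bartlett.test() tests the equal-variance assumption directly (my addition,
#not part of the course video):
bartlett.test(Days ~ Genre, data = film)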
# So moving on to the actual ANOVA function, first, let’s give R a name for our model. I’ll just call it
# daysmodel; you can call it whatever you want. And then we’re going to assign to that the function, aov().
# And then all we have to give to this function is the dependent variable, “film$Days”, as a function of our
# independent variable, “Genre”. So again, notice that this notation is back to the regular notation, where we
# give it the data frame dollar sign variable name.

daysmodel <- aov(film$Days ~ film$Genre)

# In order to retrieve the information from this model, we need to
# pass it through one more function, which is called summary(). So passing our ANOVA model, “daysmodel”,
# through the summary() function, we’re going to get our ANOVA table displayed in the Console window.

summary(daysmodel)
##              Df Sum Sq Mean Sq F value Pr(>F)
## film$Genre    3   3162    1054   0.785  0.504
## Residuals   147 197278    1342

p_value<-0.504 #Pr(>F) from the ANOVA table above
p_value <= .05 #reject null hypothesis if true
## [1] FALSE
p_value > .05 #fail to reject null hypothesis if true
## [1] TRUE

# So we can see here with a p-value greater
# than 0.05, we would fail to reject the null hypothesis that the means
# of all four groups are equal to each other.
# So if it were the case that we had a significant p-value for this ANOVA, we would follow up this analysis
# with a post hoc analysis, where we find which means are different from each other.
# So just to illustrate that process, I want to show you one more function, which will run the Tukey pairwise
# comparisons for every single group in your model. All you have to do is give the object name, daysmodel,
# into this function, called TukeyHSD().

TukeyHSD(daysmodel)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = film$Days ~ film$Genre)
## 
## $`film$Genre`
##                                  diff       lwr      upr     p adj
## Animation-Action/Adventure  14.508366 -12.49138 41.50811 0.5036180
## Comedy-Action/Adventure     -4.004240 -28.79839 20.78991 0.9750409
## Drama-Action/Adventure      -1.951952 -34.94619 31.04229 0.9987000
## Comedy-Animation           -18.512605 -52.87019 15.84498 0.5012353
## Drama-Animation            -16.460317 -57.13357 24.21293 0.7193063
## Drama-Comedy                 2.052288 -37.19152 41.29610 0.9991003
plot(TukeyHSD(daysmodel)) #plot the differences visually

# And what it will output for you is every single pairwise comparison: the difference in the means, a lower
# and upper confidence interval for that difference, and then the p-value adjusted by the Tukey
# method. We can see that none of these pairwise comparisons are significant, which makes sense because
# our overall ANOVA wasn’t significant. But if you did have a significant ANOVA, you would go into this
# table and see where those differences actually occur.
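
#If you only wanted the significant rows, you could filter the Tukey matrix
#on the adjusted p-value column (a sketch of my own; here it returns no rows,
#since nothing was significant):
tukey_table <- TukeyHSD(daysmodel)$`film$Genre` #matrix with columns diff, lwr, upr, p adj
tukey_table[tukey_table[, "p adj"] < 0.05, , drop = FALSE]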

###############Pre-Lab 5: Top Grossing Films  ##########################
# Primary Research Questions
# 
# 1. Does a film’s rating (PG, PG-13, or R) impact its cost to produce?
# 2. Does a film’s rating (PG, PG-13, or R) influence its IMDB score?

#Check the Data

#import data
film<-fread("https://d37djvu3ytnwxt.cloudfront.net/assets/courseware/v1/d1c5e00eb9597e12480cd507b93cbb26/asset-v1:UTAustinX+UT.7.21x+2T2017+type@asset+block/FilmData.csv")

#inspect data
glimpse(film)
## Observations: 151
## Variables: 15
## $ Rank      <int> 1, 2, 4, 8, 9, 13, 15, 16, 17, 18, 20, 22, 23, 27, 2...
## $ Film      <chr> "Avatar", "Titanic", "Harry Potter and the Deathly H...
## $ Studio    <chr> "Fox", "Par.", "WB", "Sony", "WB", "Fox", "WB", "WB"...
## $ Genre     <chr> "Action/Adventure", "Drama", "Action/Adventure", "Ac...
## $ Year      <int> 2009, 1997, 2011, 2012, 2012, 1999, 2012, 2008, 2001...
## $ Gross     <S3: integer64> 1.374639e-314, 1.079731e-314, 6.627891e-31...
## $ Gross.Dom <int> 760500000, 658700000, 381000000, 304400000, 44810000...
## $ Pct.Dom   <dbl> 0.273, 0.301, 0.284, 0.275, 0.413, 0.462, 0.298, 0.5...
## $ Gross.Ovr <int> 2021800000, 1526700000, 960500000, 804200000, 636300...
## $ Pct.Ovr   <dbl> 0.727, 0.699, 0.716, 0.725, 0.587, 0.538, 0.702, 0.4...
## $ Rotten    <int> 83, 88, 96, 92, 87, 57, 65, 94, 80, 92, 78, 79, 84, ...
## $ IMDB      <dbl> 8.0, 7.6, 8.1, 7.8, 8.6, 6.5, 8.1, 9.0, 7.3, 8.0, 7....
## $ Rating    <chr> "PG13", "PG13", "PG13", "PG13", "PG13", "PG", "PG13"...
## $ Days      <int> 238, 219, 133, 122, 147, 261, 133, 231, 162, 147, 14...
## $ Budget    <dbl> 237.0, 200.0, 125.0, 200.0, 250.0, 115.0, 180.0, 185...

#1a. What numerical rank does Titanic hold among highest-grossing films?
filter(film, Film == "Titanic")
##   Rank    Film Studio Genre Year         Gross Gross.Dom Pct.Dom
## 1    2 Titanic   Par. Drama 1997 1.079731e-314 658700000   0.301
##    Gross.Ovr Pct.Ovr Rotten IMDB Rating Days Budget
## 1 1526700000   0.699     88  7.6   PG13  219    200
#Answer: 2

#1b. What is the name of the highest ranked film made by Universal Studios?
#Double-check your spelling before you submit.
sqldf('SELECT DISTINCT Studio FROM film') #list all studio names
##   Studio
## 1    Fox
## 2   Par.
## 3     WB
## 4   Sony
## 5   Uni.

univ_studios<-film %>%
  filter(Studio=="Uni.") %>% #select only Universal Studios
  arrange(Rank) #Order by Rank in ascending order

head(univ_studios,1) #check universal studio movies
##   Rank          Film Studio            Genre Year         Gross Gross.Dom
## 1   18 Jurassic Park   Uni. Action/Adventure 1993 4.791943e-315 402500000
##   Pct.Dom Gross.Ovr Pct.Ovr Rotten IMDB Rating Days Budget
## 1   0.415 567400000   0.585     92    8   PG13  147     63
#Answer: Jurassic Park

#1c. What was the lowest IMDB rating given to a film that ranked in the top 10
#grossing films of all time?

top10_films<-film %>%
  top_n(10, Gross) %>% #select the top 10 grossing
  arrange(IMDB) #sort by IMDB score ascending

top10_films #check results
##    Rank                                        Film Studio
## 1    13   Star Wars: Episode I - The Phantom Menace    Fox
## 2    17       Harry Potter and the Sorcerer's Stone     WB
## 3     2                                     Titanic   Par.
## 4     8                                     Skyfall   Sony
## 5     1                                      Avatar    Fox
## 6    18                               Jurassic Park   Uni.
## 7     4 Harry Potter and the Deathly Hallows Part 2     WB
## 8    15           The Hobbit: An Unexpected Journey     WB
## 9     9                       The Dark Knight Rises     WB
## 10   16                             The Dark Knight     WB
##               Genre Year         Gross Gross.Dom Pct.Dom  Gross.Ovr
## 1  Action/Adventure 1999 5.074054e-315 474500000   0.462  552500000
## 2  Action/Adventure 2001 4.816152e-315 317600000   0.326  657200000
## 3             Drama 1997 1.079731e-314 658700000   0.301 1526700000
## 4  Action/Adventure 2012 5.477212e-315 304400000   0.275  804200000
## 5  Action/Adventure 2009 1.374639e-314 760500000   0.273 2021800000
## 6  Action/Adventure 1993 4.791943e-315 402500000   0.415  567400000
## 7  Action/Adventure 2011 6.627891e-315 381000000   0.284  960500000
## 8  Action/Adventure 2012 5.024648e-315 303000000   0.298  714000000
## 9  Action/Adventure 2012 5.357648e-315 448100000   0.413  636300000
## 10 Action/Adventure 2008 4.963383e-315 534900000   0.532  469700000
##    Pct.Ovr Rotten IMDB Rating Days Budget
## 1    0.538     57  6.5     PG  261    115
## 2    0.674     80  7.3     PG  162    125
## 3    0.699     88  7.6   PG13  219    200
## 4    0.725     92  7.8   PG13  122    200
## 5    0.727     83  8.0   PG13  238    237
## 6    0.585     92  8.0   PG13  147     63
## 7    0.716     96  8.1   PG13  133    125
## 8    0.702     65  8.1   PG13  133    180
## 9    0.587     87  8.6   PG13  147    250
## 10   0.468     94  9.0   PG13  231    185
top10_films[1,] #extract top result
##   Rank                                      Film Studio            Genre
## 1   13 Star Wars: Episode I - The Phantom Menace    Fox Action/Adventure
##   Year         Gross Gross.Dom Pct.Dom Gross.Ovr Pct.Ovr Rotten IMDB
## 1 1999 5.074054e-315 474500000   0.462 552500000   0.538     57  6.5
##   Rating Days Budget
## 1     PG  261    115
#Answer: 6.5 (the course key says 7.6, but the data here clearly give 6.5)
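
#A sketch of the pre-lab's two primary research questions (my own code; the
#lab itself works through the answers). Keep only the three ratings named in
#the questions, then run a one-way ANOVA for each outcome:
film_rated <- filter(film, Rating %in% c("PG", "PG13", "R"))
#1. Does a film's rating impact its production budget?
summary(aov(Budget ~ Rating, data = film_rated))
#2. Does a film's rating influence its IMDB score?
summary(aov(IMDB ~ Rating, data = film_rated))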

##############Week 6:  Correlation and Regression   ##########################