Topics for today!
- Topic: table()
- Topic: chisq.test()
- Topic: binom.test()
- Topic: prop.test()
Data
# Point the session at the lecture folder so the relative file name below resolves.
# NOTE(review): setwd() with a machine-specific path makes these notes non-portable;
# consider passing a full path built with file.path() to read.table() instead.
setwd("~/Desktop/R Materials/mih140/Lecture 5 - Introduction to Plotting in R")
# Load the tab-separated flight records; the first row supplies the column names.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
flight_data <- read.table("RegionEx_Data.txt", header = TRUE, sep = "\t")
1. Topic: Tables
# Example: how many observations come from MDA flights?
airlines <- flight_data$Airline # one airline label per flight
num_mda_flights <- sum(airlines == "MDA") # each TRUE counts as 1, so the sum is the number of MDA flights
# Alternative: table() takes a vector of categorical data and counts each distinct value.
tab_airlines <- table(airlines)
prop.table(tab_airlines) # relative frequencies (count / total) instead of raw counts
## airlines
## MDA RegionEx
## 0.3333333 0.6666667
# Passing several categorical vectors to table() cross-tabulates them.
tab_airline_airport <- table(flight_data$Airline, flight_data$Origin.airport)
# table() also accepts the columns of a data frame directly;
# here the dimensions are labelled with the column names.
table(flight_data[c("Airline", "Origin.airport")])
## Origin.airport
## Airline DFW MSY PNS
## MDA 30 60 30
## RegionEx 90 120 30
Tables cont.
# QU: Are flights uniformly distributed throughout the week?
# Count the flights per weekday, then run a chi-squared goodness-of-fit test;
# with a one-way table and no probabilities given, every category is assumed equally likely.
days <- table(flight_data$Day.of.Week)
chisq.test(days)
##
## Chi-squared test for given probabilities
##
## data: days
## X-squared = 4, df = 6, p-value = 0.6767
# Weekday-by-airline contingency table, kept for inspection.
days_airlines <- table(flight_data$Day.of.Week, flight_data$Airline)
# chisq.test() can also be given the two categorical vectors directly;
# it cross-tabulates them itself before testing for independence.
res <- chisq.test(flight_data$Day.of.Week, flight_data$Airline)
2. Topic: Chi-squared test
# QU: Are the day of the week and whether a flight is delayed independent?
# Cross-tabulate weekday against the delay indicator, then test the two-way table.
tab <- table(flight_data$Day.of.Week, flight_data$Delay.indicator)
chisq.test(tab)
##
## Pearson's Chi-squared test
##
## data: tab
## X-squared = 274.16, df = 6, p-value < 2.2e-16
Chi-squared test cont.
# QU: Are the airline and whether a flight is delayed independent?
tab <- table(flight_data$Airline, flight_data$Delay.indicator)
# Keep the test object so its components can be inspected afterwards.
results <- chisq.test(tab)
results[["observed"]] # the counts that went into the test
##
## 0 1
## MDA 86 31
## RegionEx 177 63
results[["expected"]] # the counts expected if airline and delay were independent
##
## 0 1
## MDA 86.19328 30.80672
## RegionEx 176.80672 63.19328
3. Topic: binom.test().
# Syntax for binom.test():
## binom.test(successes, N, p, alternative = , conf.level )
# QU: Is the proportion of delays statistically significantly different from .2?
delays <- table(flight_data$Delay.indicator)
# Passing only the two-entry table leaves p at its default of .5 and treats the
# first entry as the number of successes, so this is not yet the test asked for.
binom.test(delays)
##
## Exact binomial test
##
## data: delays
## number of successes = 263, number of trials = 357, p-value < 2.2e-16
## alternative hypothesis: true probability of success is not equal to 0.5
## 95 percent confidence interval:
## 0.6877622 0.7816522
## sample estimates:
## probability of success
## 0.7366947
# Correct test: the second entry of the table is the success count, the table
# total is the number of trials, and the null proportion is .2.
binom.test(x = delays[2], n = sum(delays), p = 0.2, alternative = "two.sided")
##
## Exact binomial test
##
## data: delays[2] and sum(delays)
## number of successes = 94, number of trials = 357, p-value = 0.003562
## alternative hypothesis: true probability of success is not equal to 0.2
## 95 percent confidence interval:
## 0.2183478 0.3122378
## sample estimates:
## probability of success
## 0.2633053
4. Topic: prop.test().
# prop.test() takes the same kind of arguments as binom.test().
# Given a two-column table with one row per airline, it compares the two row proportions.
delays <- table(flight_data$Airline, flight_data$Delay.indicator)
prop.test(delays)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: delays
## X-squared = 3.6543e-30, df = 1, p-value = 1
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.10234687 0.09743234
## sample estimates:
## prop 1 prop 2
## 0.7350427 0.7375000
# Back to a single sample: counts of the delay indicator over all flights.
delays <- table(flight_data$Delay.indicator)
# With only the two-entry table supplied, the null proportion defaults to .5.
prop.test(delays)
##
## 1-sample proportions test with continuity correction
##
## data: delays, null probability 0.5
## X-squared = 79.059, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.6872004 0.7809959
## sample estimates:
## p
## 0.7366947
# Correct test: compare the share of delayed flights (second entry of `delays`)
# with the hypothesised proportion of .2. Without `p`, prop.test() falls back to
# the default null of .5 and only repeats the call above.
# `[[2]]` extracts the bare count; `delays[2]` would still be a one-entry table,
# which prop.test() rejects because it expects a table to hold two entries.
res <- prop.test(delays[[2]], sum(delays), p = 0.2,
                 alternative = "two.sided", conf.level = 0.95)