Topics for today!

  1. Topic: table()
  2. Topic: chisq.test()
  3. Topic: binom.test()
  4. Topic: prop.test()

Data

# Load the RegionEx flight records used by every example in these notes.
# NOTE(review): setwd() ties the script to one machine's directory layout. It is
# kept so the relative file name below resolves exactly as before, but a
# project-relative path would be more portable.
setwd("~/Desktop/R Materials/mih140/Lecture 5 - Introduction to Plotting in R")
# Tab-separated file whose first row holds the column names. TRUE is spelled
# out because T is an ordinary variable that can be reassigned.
flight_data <- read.table("RegionEx_Data.txt", header = TRUE, sep = "\t")

1. Topic: Tables

# Example: how many observations come from MDA flights?
airlines <- flight_data$Airline # one airline label per observation
num_mda_flights <- sum(airlines == "MDA") # each TRUE counts as 1, so this tallies MDA rows

# Alternative: table() takes a vector of categorical data and counts each level.
tab_airlines <- table(airlines)

prop.table(tab_airlines) # proportions (count / total) instead of raw counts
## airlines
##       MDA  RegionEx 
## 0.3333333 0.6666667
# Given several categorical vectors, table() cross-tabulates them.
tab_airline_airport <- table(flight_data$Airline, flight_data$Origin.airport)

# table() also accepts the columns of a data frame directly.

table(flight_data[c("Airline", "Origin.airport")])
##           Origin.airport
## Airline    DFW MSY PNS
##   MDA       30  60  30
##   RegionEx  90 120  30

Tables cont.

# QU: Are flights spread uniformly across the days of the week?
days <- table(flight_data$Day.of.Week)
chisq.test(days) # one-way table: tested against equal probabilities by default
## 
##  Chi-squared test for given probabilities
## 
## data:  days
## X-squared = 4, df = 6, p-value = 0.6767
days_airlines <- table(flight_data$Day.of.Week, flight_data$Airline)
# Handed two vectors, chisq.test() cross-tabulates them itself before testing.
res <- chisq.test(flight_data$Day.of.Week, flight_data$Airline)

2. Topic: Chi-squared test

# QU: Is the day of the week independent of whether the flight is delayed?
tab <- table(flight_data$Day.of.Week, flight_data$Delay.indicator)
chisq.test(tab) # two-way table, so this runs Pearson's test of independence
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = 274.16, df = 6, p-value < 2.2e-16

Chi-squared test cont.

# QU: Is the airline independent of whether the flight is delayed?
tab <- table(flight_data$Airline, flight_data$Delay.indicator)
results <- chisq.test(tab)
results[["observed"]] # the counts that went into the test
##           
##              0   1
##   MDA       86  31
##   RegionEx 177  63
results[["expected"]] # the counts expected if the two variables were independent
##           
##                    0        1
##   MDA       86.19328 30.80672
##   RegionEx 176.80672 63.19328

3. Topic: binom.test().

# Syntax for binom.test():
##   binom.test(successes, N, p, alternative = , conf.level )

# QU: Is the proportion of delays statistically significantly different from 0.2?
delays <- table(flight_data$Delay.indicator)
# Default call: the null value is p = 0.5, and the first count in the table
# (the flights that were not delayed) is taken as the number of successes.
binom.test(delays)
## 
##  Exact binomial test
## 
## data:  delays
## number of successes = 263, number of trials = 357, p-value < 2.2e-16
## alternative hypothesis: true probability of success is not equal to 0.5
## 95 percent confidence interval:
##  0.6877622 0.7816522
## sample estimates:
## probability of success 
##              0.7366947
# Correct test: delayed flights (second table entry) out of all flights, against p = 0.2.
binom.test(delays[2], sum(delays), p = 0.2, alternative = "two.sided")
## 
##  Exact binomial test
## 
## data:  delays[2] and sum(delays)
## number of successes = 94, number of trials = 357, p-value = 0.003562
## alternative hypothesis: true probability of success is not equal to 0.2
## 95 percent confidence interval:
##  0.2183478 0.3122378
## sample estimates:
## probability of success 
##              0.2633053

4. Topic: prop.test().

# prop.test() is called the same way as binom.test().
delays <- table(flight_data$Airline, flight_data$Delay.indicator)
# Two-column table with one row per airline: compares the airlines' proportions
# of the first column (flights that were not delayed).
prop.test(delays)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  delays
## X-squared = 3.6543e-30, df = 1, p-value = 1
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.10234687  0.09743234
## sample estimates:
##    prop 1    prop 2 
## 0.7350427 0.7375000
delays <- table(flight_data$Delay.indicator)
# Default call: the null value is p = 0.5, and the first count (not delayed) is
# taken as the number of successes.
prop.test(delays)
## 
##  1-sample proportions test with continuity correction
## 
## data:  delays, null probability 0.5
## X-squared = 79.059, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.6872004 0.7809959
## sample estimates:
##         p 
## 0.7366947
# Correct test: delayed flights out of all flights, against the null value 0.2,
# mirroring the binom.test() example. The count is pulled out by its label with
# [[ ]] so a plain number is passed: prop.test() treats a one-dimensional table
# as a successes/failures pair and rejects one that has a single entry.
res <- prop.test(
  delays[["1"]], sum(delays),
  p = 0.2, alternative = "two.sided", conf.level = 0.95
)