Introduction To Probability and Data

Introduction To Probability and Data using R

This document uses most frequent function which can be use to do frequent data analysis functionality and usage of dpylr, dataframes and ggplot functions data sets of this course can be use afer installin library at install_github(“StatsWithR/statsr”)

I ddin’t spend much time to create this document, unlike most developrs i commented before using r codes

rm(list=ls())
if (!require("pacman")) install.packages("pacman")

## Loading required package: pacman

## Warning: package 'pacman' was built under R version 3.5.1

pacman::p_load("data.table", "devtools", "dplyr","ggplot2", "shiny", "MASS")
# this package includes all the dataset used in these excercise
#install_github("StatsWithR/statsr")
library(statsr)

#Dataset 1: Dr. Arbuthnot's Baptism Records
#The Arbuthnot data set refers to Dr. John Arbuthnot, an 18th century physician, writer, and mathematician. He was interested in the ratio of newborn boys to newborn girls

data("arbuthnot")
names(arbuthnot)

## [1] "year"  "boys"  "girls"

# Getting dimenssions of dataset
dim(arbuthnot)

## [1] 82  3

# access columns
arbuthnot$boys

##  [1] 5218 4858 4422 4994 5158 5035 5106 4917 4703 5359 5366 5518 5470 5460
## [15] 4793 4107 4047 3768 3796 3363 3079 2890 3231 3220 3196 3441 3655 3668
## [29] 3396 3157 3209 3724 4748 5216 5411 6041 5114 4678 5616 6073 6506 6278
## [43] 6449 6443 6073 6113 6058 6552 6423 6568 6247 6548 6822 6909 7577 7575
## [57] 7484 7575 7737 7487 7604 7909 7662 7602 7676 6985 7263 7632 8062 8426
## [71] 7911 7578 8102 8031 7765 6113 8366 7952 8379 8239 7840 7640

# point and line plot
ggplot(data = arbuthnot, aes(x = year, y = arbuthnot$girls)) + geom_point() + geom_line()

# adding new variable in df

arbuthnot <- arbuthnot %>% mutate(total = boys + girls)

ggplot(data = arbuthnot, aes(x = year, y = total)) + geom_point() + geom_line()

# proportion of boys
ggplot(data = arbuthnot, aes(x = year, y = boys/total)) + geom_point() + geom_line()

# some other usage

arbuthnot <- arbuthnot %>% mutate(more_boys = boys > girls)

# dataset 2 Present Birth Records
data(present)
str(present)

## Classes 'tbl_df', 'tbl' and 'data.frame':    74 obs. of  3 variables:
##  $ year : num  1940 1941 1942 1943 1944 ...
##  $ boys : num  1211684 1289734 1444365 1508959 1435301 ...
##  $ girls: num  1148715 1223693 1364631 1427901 1359499 ...

dim(present)

## [1] 74  3

range(present$year)

## [1] 1940 2013

present <- present %>% mutate(total = boys + girls)
present <- present %>% mutate(prop_boys = boys/total)
present <- present %>% mutate(more_boys = boys > girls)
ggplot(data = present, aes(x = year, y = prop_boys)) + geom_line()

# more girls every year?
present %>% group_by(more_boys) %>% summarise(count = n())

## # A tibble: 1 x 2
##   more_boys count
##   <lgl>     <int>
## 1 TRUE         74

present <- present %>% mutate(prop_boy_girl = boys/girls)
ggplot(data = present, aes(x = year, y = prop_boy_girl)) +  geom_line()

# max borns
present %>% arrange(desc(total))

## # A tibble: 74 x 7
##     year    boys   girls   total prop_boys more_boys prop_boy_girl
##    <dbl>   <dbl>   <dbl>   <dbl>     <dbl> <lgl>             <dbl>
##  1  2007 2208071 2108162 4316233     0.512 TRUE               1.05
##  2  1961 2186274 2082052 4268326     0.512 TRUE               1.05
##  3  2006 2184237 2081318 4265555     0.512 TRUE               1.05
##  4  1960 2179708 2078142 4257850     0.512 TRUE               1.05
##  5  1957 2179960 2074824 4254784     0.512 TRUE               1.05
##  6  2008 2173625 2074069 4247694     0.512 TRUE               1.05
##  7  1959 2173638 2071158 4244796     0.512 TRUE               1.05
##  8  1958 2152546 2051266 4203812     0.512 TRUE               1.05
##  9  1962 2132466 2034896 4167362     0.512 TRUE               1.05
## 10  1956 2133588 2029502 4163090     0.513 TRUE               1.05
## # ... with 64 more rows

############################## Week 2 ###############################################
# The Bureau of Transportation Statistics (BTS) is a statistical agency that is a part of the Research and Innovative Technology Administration (RITA). As its name implies, BTS collects and makes available transportation data, such as the flights data we will be working with in this lab.
# We begin by loading the nycflights data frame. Type the following in your console to load the data:
data("nycflights")
names(nycflights)

##  [1] "year"      "month"     "day"       "dep_time"  "dep_delay"
##  [6] "arr_time"  "arr_delay" "carrier"   "tailnum"   "flight"   
## [11] "origin"    "dest"      "air_time"  "distance"  "hour"     
## [16] "minute"

str(nycflights)

## Classes 'tbl_df' and 'data.frame':   32735 obs. of  16 variables:
##  $ year     : int  2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
##  $ month    : int  6 5 12 5 7 1 12 8 9 4 ...
##  $ day      : int  30 7 8 14 21 1 9 13 26 30 ...
##  $ dep_time : int  940 1657 859 1841 1102 1817 1259 1920 725 1323 ...
##  $ dep_delay: num  15 -3 -1 -4 -3 -3 14 85 -10 62 ...
##  $ arr_time : int  1216 2104 1238 2122 1230 2008 1617 2032 1027 1549 ...
##  $ arr_delay: num  -4 10 11 -34 -8 3 22 71 -8 60 ...
##  $ carrier  : chr  "VX" "DL" "DL" "DL" ...
##  $ tailnum  : chr  "N626VA" "N3760C" "N712TW" "N914DL" ...
##  $ flight   : int  407 329 422 2391 3652 353 1428 1407 2279 4162 ...
##  $ origin   : chr  "JFK" "JFK" "JFK" "JFK" ...
##  $ dest     : chr  "LAX" "SJU" "LAX" "TPA" ...
##  $ air_time : num  313 216 376 135 50 138 240 48 148 110 ...
##  $ distance : num  2475 1598 2475 1005 296 ...
##  $ hour     : num  9 16 8 18 11 18 12 19 7 13 ...
##  $ minute   : num  40 57 59 41 2 17 59 20 25 23 ...

# Seven verbs
# The dplyr package offers seven verbs (functions) for basic data manipulation:
# filter()
# arrange()
# select()
# distinct()
# mutate()
# summarise()
# sample_n()
# We will use some of these functions in this lab, and learn about others in a future lab.


# plot to delay in departure
ggplot(data = nycflights, aes(x = dep_delay)) + geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = nycflights, aes(x = dep_delay)) + geom_histogram(binwidth = 15)

ggplot(data = nycflights, aes(x = dep_delay)) + geom_histogram(binwidth = 150)

# delay to RDU airport
rdu_flights <- nycflights %>%
  filter(dest == "RDU")
ggplot(data = rdu_flights, aes(x = dep_delay)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

rdu_flights %>%
  summarise(mean_dd = mean(dep_delay), sd_dd = sd(dep_delay), n = n())

## # A tibble: 1 x 3
##   mean_dd sd_dd     n
##     <dbl> <dbl> <int>
## 1    11.7  35.6   801

# Summary statistics: Some useful function calls for summary statistics for a single numerical variable are as follows:
# mean
# median
# sd
# var
# IQR
# range
# min
# max

#flights headed to San Francisco (SFO) in February

sfo_feb <- nycflights %>% filter(dest == "SFO" & month == 2)

#or
sfo_feb_flights <- nycflights %>%
  filter(dest == "SFO", month == 2)

ggplot(data = sfo_feb, aes(x = arr_delay)) + geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

summary(sfo_feb$arr_delay)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -66.00  -21.25  -11.00   -4.50    2.00  196.00

# flight delay by origin
rdu_flights %>%
  group_by(origin) %>%
  summarise(mean_dd = mean(dep_delay), sd_dd = sd(dep_delay), n = n())

## # A tibble: 3 x 4
##   origin mean_dd sd_dd     n
##   <chr>    <dbl> <dbl> <int>
## 1 EWR      13.4   32.1   145
## 2 JFK      15.4   40.3   300
## 3 LGA       7.90  32.2   356

# delay month wise
nycflights %>%
  group_by(month) %>%
  summarise(mean_dd = mean(dep_delay)) %>%
  arrange(desc(mean_dd))

## # A tibble: 12 x 2
##    month mean_dd
##    <int>   <dbl>
##  1     7   20.8 
##  2     6   20.4 
##  3    12   17.4 
##  4     4   14.6 
##  5     3   13.5 
##  6     5   13.3 
##  7     8   12.6 
##  8     2   10.7 
##  9     1   10.2 
## 10     9    6.87
## 11    11    6.10
## 12    10    5.88

# side by side box plots
ggplot(nycflights, aes(x = factor(month), y = dep_delay)) +
  geom_boxplot()

# using ifelse
#On time departure rate for NYC airports
nycflights <- nycflights %>% mutate(dep_type = ifelse(dep_delay < 5.0, "on time", "delayed"))

nycflights %>% group_by(origin) %>% summarise(ot_dep_rate = sum(dep_type == "on time")/n()) %>% arrange(desc(ot_dep_rate))

## # A tibble: 3 x 2
##   origin ot_dep_rate
##   <chr>        <dbl>
## 1 LGA          0.728
## 2 JFK          0.694
## 3 EWR          0.637

# visualize the distribution of on on time departure rate across the three airports using a segmented bar plot.
ggplot(data = nycflights, aes(x = origin, fill = dep_type)) +  geom_bar()

nycflights <- nycflights %>% mutate(avg_speed = distance/(air_time/60)) %>% arrange(desc(avg_speed))

# get first number
nycflights[1:1,'tailnum']

## # A tibble: 1 x 1
##   tailnum
##   <chr>  
## 1 N666DN

# avg speed vs distance
ggplot(data = nycflights, aes(x = distance, y = avg_speed)) + geom_point()

# Suppose you define a flight to be "on time" if it gets to the destination on time or earlier than expected, regardless of any departure delays. Mutate the data frame to create a new variable called arr_type with levels "on time" and "delayed" based on this definition. Then, determine the on time arrival percentage based on whether the flight departed on time or not. What proportion of flights that were "delayed" departing arrive "on time"? [NUMERIC INPUT]

nycflights <- nycflights %>% mutate(arr_type = ifelse(arr_delay > 0, "delayed", "on time"))

ontime_arr_ratio <- nycflights %>% summarise(prop = sum(arr_type == "on time")/n()) 
ontime_arr_ratio

## # A tibble: 1 x 1
##    prop
##   <dbl>
## 1 0.589

nycflights %>% summarise(prop = sum(arr_type == "on time" & dep_type == "delayed")/sum(dep_type == "delayed"))

## # A tibble: 1 x 1
##    prop
##   <dbl>
## 1 0.183

#################################### Simulations in R#########################################
coin_outcomes <- c("heads", "tails")
sim_fair_coin <- sample(coin_outcomes, size = 100, replace = TRUE)
table(sim_fair_coin)

## sim_fair_coin
## heads tails 
##    47    53

sim_unfair_coin <- sample(coin_outcomes, size = 100, replace = TRUE, 
                          prob = c(0.2, 0.8))
table(sim_unfair_coin)

## sim_unfair_coin
## heads tails 
##    12    88

Introduction To Probability and Data

Vineet

August 10, 2018

Introduction To Probability and Data using R