Chapter 8 Matrices and Dataframes
# Create a dataframe of boat sale data named bsale
bsale <- data.frame(name = c("a", "b", "c", "d", "e", "f", "g", "h", "i", "j"),
color = c("black", "green", "pink", "blue", "blue",
"green", "green", "yellow", "black", "black"),
age = c(143, 53, 356, 23, 647, 24, 532, 43, 66, 86),
price = c(53, 87, 54, 66, 264, 32, 532, 58, 99, 132),
cost = c(52, 80, 20, 100, 189, 12, 520, 68, 80, 100),
stringsAsFactors = FALSE)
# Avoid converting strings to factors!
# Explore the bsale dataset:
head(bsale) # Show the first few rows
  name color age price cost
1 a black 143 53 52
2 b green 53 87 80
3 c pink 356 54 20
4 d blue 23 66 100
5 e blue 647 264 189
6 f green 24 32 12
str(bsale) # Show the structure of bsale
'data.frame': 10 obs. of 5 variables:
$ name : chr "a" "b" "c" "d" ...
$ color: chr "black" "green" "pink" "blue" ...
$ age : num 143 53 356 23 647 24 532 43 66 86
$ price: num 53 87 54 66 264 32 532 58 99 132
$ cost : num 52 80 20 100 189 12 520 68 80 100
names(bsale) # The column names
[1] "name" "color" "age" "price" "cost"
nrow(bsale) # The number of rows
[1] 10
mean(bsale$age) # The mean boat age
[1] 197.3
table(bsale$color) # How many boats of each color?
black blue green pink yellow
3 2 3 1 1
max(bsale$price) # The highest selling price
[1] 532
# Adding new columns
bsale$id <- 1:nrow(bsale)
bsale$age.decades <- bsale$age / 10
bsale$profit <- bsale$price - bsale$cost
# The mean price of green boats
with(bsale, mean(price[color == "green"]))
[1] 217
# The names of boats older than 100 years
with(bsale, name[age > 100])
[1] "a" "c" "e" "g"
# What percent of black boats had a positive profit?
with(subset(bsale, color == "black"), mean(profit > 0))
[1] 1
# Save only the price and cost columns in a new dataframe
bsale.2 <- bsale[c("price", "cost")]
# Change the names of the columns to "p" and "c"
names(bsale.2) <- c("p", "c")
# Use subset() to create a dataframe called old.black.bsale containing only data from black boats older than 50 years
old.black.bsale <- subset(bsale, color == "black" & age > 50)
x <- 1:5
y <- 6:10
z <- 11:15
# Create a matrix where x, y and z are columns
cbind(x, y, z)
     x  y  z
[1,] 1 6 11
[2,] 2 7 12
[3,] 3 8 13
[4,] 4 9 14
[5,] 5 10 15
# Create a matrix where x, y and z are rows
rbind(x, y, z)
  [,1] [,2] [,3] [,4] [,5]
x 1 2 3 4 5
y 6 7 8 9 10
z 11 12 13 14 15
# Combining numeric and character columns in a matrix coerces everything to character:
cbind(c(1, 2, 3, 4, 5),
      c("a", "b", "c", "d", "e"))
     [,1] [,2]
[1,] "1" "a"
[2,] "2" "b"
[3,] "3" "c"
[4,] "4" "d"
[5,] "5" "e"
# Create a matrix of the integers 1:10, with 5 rows and 2 columns
matrix(data = 1:10,
nrow = 5,
       ncol = 2)
     [,1] [,2]
[1,] 1 6
[2,] 2 7
[3,] 3 8
[4,] 4 9
[5,] 5 10
# The same integers with 2 rows and 5 columns, filled by column (the default)
matrix(data = 1:10,
       nrow = 2,
       ncol = 5)
     [,1] [,2] [,3] [,4] [,5]
[1,] 1 3 5 7 9
[2,] 2 4 6 8 10
# With 2 rows and 5 columns, but fill by row instead of columns
matrix(data = 1:10,
nrow = 2,
ncol = 5,
       byrow = TRUE)
     [,1] [,2] [,3] [,4] [,5]
[1,] 1 2 3 4 5
[2,] 6 7 8 9 10
# Create a dataframe of survey data
survey <- data.frame("index" = c(1, 2, 3, 4, 5),
"sex" = c("m", "m", "m", "f", "f"),
"age" = c(99, 46, 23, 54, 23))
survey
  index sex age
1 1 m 99
2 2 m 46
3 3 m 23
4 4 f 54
5 5 f 23
str(survey)
'data.frame': 5 obs. of 3 variables:
$ index: num 1 2 3 4 5
$ sex : Factor w/ 2 levels "f","m": 2 2 2 1 1
$ age : num 99 46 23 54 23
#R converted sex to a factor with only two possible levels
# Create a dataframe of survey data without factors
survey <- data.frame("index" = c(1, 2, 3, 4, 5),
"sex" = c("m", "m", "m", "f", "f"),
"age" = c(99, 46, 23, 54, 23),
stringsAsFactors = FALSE)
# Check the structure to confirm there are no factors
str(survey)
'data.frame': 5 obs. of 3 variables:
$ index: num 1 2 3 4 5
$ sex : chr "m" "m" "m" "f" ...
$ age : num 99 46 23 54 23
#To see a complete list of all the datasets included in the datasets package, run the code: library(help = "datasets")
library(help = "datasets")
##8.3 Matrix and dataframe functions
# head() shows the first few rows
head(ChickWeight)
  weight Time Chick Diet
1 42 0 1 1
2 51 2 1 1
3 59 4 1 1
4 64 6 1 1
5 76 8 1 1
6 93 10 1 1
# tail() shows the last few rows
tail(ChickWeight)
    weight Time Chick Diet
573 155 12 50 4
574 175 14 50 4
575 205 16 50 4
576 234 18 50 4
577 264 20 50 4
578 264 21 50 4
# View() opens the entire dataframe in a new window
View(ChickWeight)
# Print summary statistics of ToothGrowth to the console
summary(ToothGrowth)
      len        supp        dose
Min. : 4.20 OJ:30 Min. :0.500
1st Qu.:13.07 VC:30 1st Qu.:0.500
Median :19.25 Median :1.000
Mean :18.81 Mean :1.167
3rd Qu.:25.27 3rd Qu.:2.000
Max. :33.90 Max. :2.000
str(ToothGrowth) # Show the structure
'data.frame': 60 obs. of 3 variables:
$ len : num 4.2 11.5 7.3 5.8 6.4 10 11.2 11.2 5.2 7 ...
$ supp: Factor w/ 2 levels "OJ","VC": 2 2 2 2 2 2 2 2 2 2 ...
$ dose: num 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
names(ToothGrowth) # The column names
[1] "len" "supp" "dose"
ToothGrowth$len # The len column as a vector
[1] 4.2 11.5 7.3 5.8 6.4 10.0 11.2 11.2 5.2 7.0 16.5 16.5 15.2 17.3
[15] 22.5 17.3 13.6 14.5 18.8 15.5 23.6 18.5 33.9 25.5 26.4 32.5 26.7 21.5
[29] 23.3 29.5 15.2 21.5 17.6 9.7 14.5 10.0 8.2 9.4 16.5 9.7 19.7 23.3
[43] 23.6 26.4 20.0 25.2 25.8 21.2 14.5 27.3 25.5 26.4 22.4 24.5 24.8 30.9
[57] 26.4 27.3 29.4 23.0
mean(ToothGrowth$len) # The mean tooth length
[1] 18.81333
table(ToothGrowth$supp) # How many observations of each supplement?
OJ VC
30 30
ToothGrowth[1:6, c("len", "supp")] # Rows 1-6 of the len and supp columns
  len supp
1 4.2 VC
2 11.5 VC
3 7.3 VC
4 5.8 VC
5 6.4 VC
6 10.0 VC
# Create a new dataframe called survey
survey <- data.frame("index" = c(1, 2, 3, 4, 5),
"age" = c(24, 25, 42, 56, 22))
survey
  index age
1 1 24
2 2 25
3 3 42
4 4 56
5 5 22
# Add a new column called sex to survey
survey$sex <- c("m", "m", "f", "f", "m")
#survey with new sex column
survey
  index age sex
1 1 24 m
2 2 25 m
3 3 42 f
4 4 56 f
5 5 22 m
#Change name of 1st column of df to "a": names(df)[1] <- "a"
#Change name of 2nd column of df to "b": names(df)[2] <- "b"
# Change the name of the first column of survey to "participant.number"
names(survey)[1] <- "participant.number"
survey
  participant.number age sex
1 1 24 m
2 2 25 m
3 3 42 f
4 4 56 f
5 5 22 m
# Change the column name from age to years
names(survey)[names(survey) == "age"] <- "years"
survey
  participant.number years sex
1 1 24 m
2 2 25 m
3 3 42 f
4 4 56 f
5 5 22 m
##8.5 Slicing dataframes
# Return row 1:          df[1, ]
# Return column 5:       df[, 5]
# Rows 1:5 of column 2:  df[1:5, 2]
# Show the rows 1-6 and column 1 of ToothGrowth
ToothGrowth[1:6, 1]
[1] 4.2 11.5 7.3 5.8 6.4 10.0
# Rows 1-3 of the len and dose columns
ToothGrowth[1:3, c("len", "dose")]
  len dose
1 4.2 0.5
2 11.5 0.5
3 7.3 0.5
# The first row of ToothGrowth
ToothGrowth[1, ]
  len supp dose
1 4.2 VC 0.5
# The entire supp column
ToothGrowth$supp
[1] VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC
[24] VC VC VC VC VC VC VC OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ
[47] OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ
Levels: OJ VC
# Create a new df with only the rows of ToothGrowth where supp equals VC
ToothGrowth.VC <- ToothGrowth[ToothGrowth$supp == "VC", ]
# Create a new df with only the rows of ToothGrowth where supp equals OJ and dose < 1
ToothGrowth.OJ.a <- ToothGrowth[ToothGrowth$supp == "OJ" &
ToothGrowth$dose < 1, ]
# Show rows of ToothGrowth where len < 20 AND supp == "OJ" AND dose >= 1
subset(x = ToothGrowth,
subset = len < 20 &
supp == "OJ" &
       dose >= 1)
  len supp dose
41 19.7 OJ 1
49 14.5 OJ 1
# Get rows of ToothGrowth where len > 30 AND supp == "VC", but only return the len and dose columns
subset(x = ToothGrowth,
subset = len > 30 & supp == "VC",
       select = c(len, dose))
  len dose
23 33.9 2
26 32.5 2
##8.6 Combining slicing with functions
# The mean tooth length of Guinea pigs given OJ
# Step 1: Create a subsetted dataframe called oj
oj <- subset(x = ToothGrowth,
subset = supp == "OJ")
# Step 2: Calculate the mean of the len column from the new subsetted dataset
mean(oj$len)
[1] 20.66333
# Get the same solution using logical indexing
# Step 1: Create a subsetted dataframe called oj
oj <- ToothGrowth[ToothGrowth$supp == "OJ",]
# Step 2: Calculate the mean of the len column from the new subsetted dataset
mean(oj$len)
[1] 20.66333
# Or do it in a single line using with()
with(ToothGrowth, mean(len[supp == "OJ"]))
[1] 20.66333
#The with() function saves you typing when you use multiple columns from a dataframe. You name the dataframe (or any other object in R) once at the start of the call, and R then looks up every name in the following expression inside that dataframe.
#create a dataframe called health
health <- data.frame("age" = c(32, 24, 43, 19, 43),
"height" = c(1.75, 1.65, 1.50, 1.92, 1.80),
"weight" = c(70, 65, 62, 79, 85))
#Add a new column called bmi which represents a person's body mass index (BMI)
health$bmi <- health$weight / health$height ^ 2
health$bmi
[1] 22.85714 23.87511 27.55556 21.43012 26.23457
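# For comparison, the same BMI calculation written with the with() idiom described above:
with(health, weight / height ^ 2)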
Chapter 9 Importing, saving and managing data
[1] "C:/Users/L5M/Desktop"
#change the working directory to diferent path
#setwd(dir= )
#see all the objects defined in the current workspace
ls()
 [1] "bsale" "bsale.2" "health"
[4] "oj" "old.black.bsale" "survey"
[7] "ToothGrowth.OJ.a" "ToothGrowth.VC" "x"
[10] "y" "z"
# Create objects to save later
study1.df <- data.frame(id = 1:5,
                        sex = c("m", "m", "f", "f", "m"),
                        score = c(51, 20, 67, 52, 42))
score.by.sex <- aggregate(score ~ sex,
                          FUN = mean,
                          data = study1.df)
study1.htest <- t.test(score ~ sex,
                       data = study1.df)
# Save three objects as a new .RData file in the data folder of my current working directory
#save(study1.df, score.by.sex, study1.htest, file = "data/study1.RData")
#To save all the objects in your workspace as a .RData file, use the "save.image()" function
#To load an .RData file, that is, to import all of the objects contained in the .RData file into your current workspace, use the load() function
#To remove objects from your workspace, use the "rm()" function
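# A minimal sketch of these three functions (commented out; the file names below are placeholders, not files from this project):
#save.image(file = "myworkspace.RData")  # save every object in the workspace
#load(file = "data/study1.RData")        # import the saved objects back into the workspace
#rm(score.by.sex)                        # remove one object
#rm(list = ls())                         # remove every object in the workspace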
# Read a text file from the web
fromweb <- read.table(file = 'http://goo.gl/jTNf6P',
sep = '\t',
header = TRUE)
# Print the result
fromweb
          message randomdata
1 Congratulations! 1
2 you 2
3 just 3
4 downloaded 4
5 this 5
6 table 6
7 from 7
8 the 8
9 web! 9
Exploratory Data Analysis Checklist
library(readr)
ozone <- read_csv("C:/Users/L5M/Desktop/HU/EDA/EDA_R_project/US EPA data 2017.csv")
# Rewrite the names of the columns to remove any spaces
names(ozone) <- make.names(names(ozone))
# Check the number of rows and columns
nrow(ozone)
[1] 66869
ncol(ozone)
[1] 55
# Examine the structure
str(ozone)
Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 66869 obs. of 55 variables:
$ State.Code : chr "01" "01" "01" "01" ...
$ County.Code : chr "003" "003" "003" "003" ...
$ Site.Num : chr "0010" "0010" "0010" "0010" ...
$ Parameter.Code : num 44201 44201 44201 44201 88101 ...
$ POC : num 1 1 1 1 1 1 1 1 1 1 ...
$ Latitude : num 30.5 30.5 30.5 30.5 30.5 ...
$ Longitude : num -87.9 -87.9 -87.9 -87.9 -87.9 ...
$ Datum : chr "NAD83" "NAD83" "NAD83" "NAD83" ...
$ Parameter.Name : chr "Ozone" "Ozone" "Ozone" "Ozone" ...
$ Sample.Duration : chr "1 HOUR" "8-HR RUN AVG BEGIN HOUR" "8-HR RUN AVG BEGIN HOUR" "8-HR RUN AVG BEGIN HOUR" ...
$ Pollutant.Standard : chr "Ozone 1-hour 1979" "Ozone 8-Hour 1997" "Ozone 8-Hour 2008" "Ozone 8-hour 2015" ...
$ Metric.Used : chr "Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM)" "Daily maximum of 8 hour running average of observed hourly values" "Daily maximum of 8 hour running average of observed hourly values" "Daily maximum of 8-hour running average" ...
$ Method.Name : chr "INSTRUMENTAL - ULTRA VIOLET ABSORPTION" NA NA NA ...
$ Year : num 2017 2017 2017 2017 2017 ...
$ Units.of.Measure : chr "Parts per million" "Parts per million" "Parts per million" "Parts per million" ...
$ Event.Type : chr "No Events" "No Events" "No Events" "No Events" ...
$ Observation.Count : num 5240 5426 5426 3852 110 ...
$ Observation.Percent : num 93 91 91 90 90 90 90 90 97 97 ...
$ Completeness.Indicator : chr "Y" "Y" "Y" "Y" ...
$ Valid.Day.Count : num 227 222 222 221 110 110 110 110 118 118 ...
$ Required.Day.Count : num 245 245 245 245 122 122 122 122 122 122 ...
$ Exceptional.Data.Count : num 0 0 0 0 0 0 0 0 0 0 ...
$ Null.Data.Count : num 640 0 0 0 12 12 12 12 6 6 ...
$ Primary.Exceedance.Count : num 0 0 0 1 0 0 NA NA 0 0 ...
$ Secondary.Exceedance.Count : num 0 0 0 1 0 0 NA NA 0 0 ...
$ Certification.Indicator : chr "Certified" "Certified" "Certified" "Certified" ...
$ Num.Obs.Below.MDL : num 0 0 0 0 0 0 0 0 0 0 ...
$ Arithmetic.Mean : num 0.0443 0.0401 0.0401 0.0399 7.3918 ...
$ Arithmetic.Standard.Dev : num 0.0122 0.0116 0.0116 0.0116 3.5723 ...
$ X1st.Max.Value : num 0.078 0.073 0.073 0.073 19.7 19.7 19.7 19.7 19 19 ...
$ X1st.Max.DateTime : POSIXct, format: "2017-05-08 15:00:00" "2017-05-08 14:00:00" ...
$ X2nd.Max.Value : num 0.078 0.07 0.07 0.07 18.9 18.9 18.9 18.9 17.4 17.4 ...
$ X2nd.Max.DateTime : POSIXct, format: "2017-05-09 14:00:00" "2017-05-09 09:00:00" ...
$ X3rd.Max.Value : num 0.072 0.067 0.067 0.067 18.9 18.9 18.9 18.9 16.8 16.8 ...
$ X3rd.Max.DateTime : POSIXct, format: "2017-05-10 14:00:00" "2017-05-10 09:00:00" ...
$ X4th.Max.Value : num 0.067 0.064 0.064 0.064 16 16 16 16 16.3 16.3 ...
$ X4th.Max.DateTime : POSIXct, format: "2017-04-25 15:00:00" "2017-05-07 13:00:00" ...
$ X1st.Max.Non.Overlapping.Value: num NA NA NA NA NA NA NA NA NA NA ...
$ X1st.NO.Max.DateTime : POSIXct, format: NA NA ...
$ X2nd.Max.Non.Overlapping.Value: num NA NA NA NA NA NA NA NA NA NA ...
$ X2nd.NO.Max.DateTime : POSIXct, format: NA NA ...
$ X99th.Percentile : num 0.072 0.067 0.067 0.067 18.9 18.9 18.9 18.9 17.4 17.4 ...
$ X98th.Percentile : num 0.067 0.062 0.062 0.062 18.9 18.9 18.9 18.9 16.8 16.8 ...
$ X95th.Percentile : num 0.063 0.058 0.058 0.058 15.4 15.4 15.4 15.4 15 15 ...
$ X90th.Percentile : num 0.06 0.054 0.054 0.054 11.8 11.8 11.8 11.8 12.5 12.5 ...
$ X75th.Percentile : num 0.053 0.049 0.049 0.048 8.9 8.9 8.9 8.9 10.4 10.4 ...
$ X50th.Percentile : num 0.044 0.04 0.04 0.04 7 7 7 7 7.2 7.2 ...
$ X10th.Percentile : num 0.027 0.024 0.024 0.024 3.8 3.8 3.8 3.8 3.8 3.8 ...
$ Local.Site.Name : chr "FAIRHOPE, Alabama" "FAIRHOPE, Alabama" "FAIRHOPE, Alabama" "FAIRHOPE, Alabama" ...
$ Address : chr "FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE, ALABAMA" "FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE, ALABAMA" "FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE, ALABAMA" "FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE, ALABAMA" ...
$ State.Name : chr "Alabama" "Alabama" "Alabama" "Alabama" ...
$ County.Name : chr "Baldwin" "Baldwin" "Baldwin" "Baldwin" ...
$ City.Name : chr "Fairhope" "Fairhope" "Fairhope" "Fairhope" ...
$ CBSA.Name : chr "Daphne-Fairhope-Foley, AL" "Daphne-Fairhope-Foley, AL" "Daphne-Fairhope-Foley, AL" "Daphne-Fairhope-Foley, AL" ...
$ Date.of.Last.Change : Date, format: "2018-07-18" "2018-07-18" ...
- attr(*, "spec")=
.. cols(
.. `State Code` = col_character(),
.. `County Code` = col_character(),
.. `Site Num` = col_character(),
.. `Parameter Code` = col_double(),
.. POC = col_double(),
.. Latitude = col_double(),
.. Longitude = col_double(),
.. Datum = col_character(),
.. `Parameter Name` = col_character(),
.. `Sample Duration` = col_character(),
.. `Pollutant Standard` = col_character(),
.. `Metric Used` = col_character(),
.. `Method Name` = col_character(),
.. Year = col_double(),
.. `Units of Measure` = col_character(),
.. `Event Type` = col_character(),
.. `Observation Count` = col_double(),
.. `Observation Percent` = col_double(),
.. `Completeness Indicator` = col_character(),
.. `Valid Day Count` = col_double(),
.. `Required Day Count` = col_double(),
.. `Exceptional Data Count` = col_double(),
.. `Null Data Count` = col_double(),
.. `Primary Exceedance Count` = col_double(),
.. `Secondary Exceedance Count` = col_double(),
.. `Certification Indicator` = col_character(),
.. `Num Obs Below MDL` = col_double(),
.. `Arithmetic Mean` = col_double(),
.. `Arithmetic Standard Dev` = col_double(),
.. `1st Max Value` = col_double(),
.. `1st Max DateTime` = col_datetime(format = ""),
.. `2nd Max Value` = col_double(),
.. `2nd Max DateTime` = col_datetime(format = ""),
.. `3rd Max Value` = col_double(),
.. `3rd Max DateTime` = col_datetime(format = ""),
.. `4th Max Value` = col_double(),
.. `4th Max DateTime` = col_datetime(format = ""),
.. `1st Max Non Overlapping Value` = col_double(),
.. `1st NO Max DateTime` = col_datetime(format = ""),
.. `2nd Max Non Overlapping Value` = col_double(),
.. `2nd NO Max DateTime` = col_datetime(format = ""),
.. `99th Percentile` = col_double(),
.. `98th Percentile` = col_double(),
.. `95th Percentile` = col_double(),
.. `90th Percentile` = col_double(),
.. `75th Percentile` = col_double(),
.. `50th Percentile` = col_double(),
.. `10th Percentile` = col_double(),
.. `Local Site Name` = col_character(),
.. Address = col_character(),
.. `State Name` = col_character(),
.. `County Name` = col_character(),
.. `City Name` = col_character(),
.. `CBSA Name` = col_character(),
.. `Date of Last Change` = col_date(format = "")
.. )
# Look at the top of the data
head(ozone)
# A tibble: 6 x 55
State.Code County.Code Site.Num Parameter.Code POC Latitude Longitude
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 01 003 0010 44201 1 30.5 -87.9
2 01 003 0010 44201 1 30.5 -87.9
3 01 003 0010 44201 1 30.5 -87.9
4 01 003 0010 44201 1 30.5 -87.9
5 01 003 0010 88101 1 30.5 -87.9
6 01 003 0010 88101 1 30.5 -87.9
# ... with 48 more variables: Datum <chr>, Parameter.Name <chr>,
# Sample.Duration <chr>, Pollutant.Standard <chr>, Metric.Used <chr>,
# Method.Name <chr>, Year <dbl>, Units.of.Measure <chr>,
# Event.Type <chr>, Observation.Count <dbl>, Observation.Percent <dbl>,
# Completeness.Indicator <chr>, Valid.Day.Count <dbl>,
# Required.Day.Count <dbl>, Exceptional.Data.Count <dbl>,
# Null.Data.Count <dbl>, Primary.Exceedance.Count <dbl>,
# Secondary.Exceedance.Count <dbl>, Certification.Indicator <chr>,
# Num.Obs.Below.MDL <dbl>, Arithmetic.Mean <dbl>,
# Arithmetic.Standard.Dev <dbl>, X1st.Max.Value <dbl>,
# X1st.Max.DateTime <dttm>, X2nd.Max.Value <dbl>,
# X2nd.Max.DateTime <dttm>, X3rd.Max.Value <dbl>,
# X3rd.Max.DateTime <dttm>, X4th.Max.Value <dbl>,
# X4th.Max.DateTime <dttm>, X1st.Max.Non.Overlapping.Value <dbl>,
# X1st.NO.Max.DateTime <dttm>, X2nd.Max.Non.Overlapping.Value <dbl>,
# X2nd.NO.Max.DateTime <dttm>, X99th.Percentile <dbl>,
# X98th.Percentile <dbl>, X95th.Percentile <dbl>,
# X90th.Percentile <dbl>, X75th.Percentile <dbl>,
# X50th.Percentile <dbl>, X10th.Percentile <dbl>, Local.Site.Name <chr>,
# Address <chr>, State.Name <chr>, County.Name <chr>, City.Name <chr>,
# CBSA.Name <chr>, Date.of.Last.Change <date>
# Check the bottom of the data too; a few columns are enough
tail(ozone[, c(6:7, 10)])
# A tibble: 6 x 3
Latitude Longitude Sample.Duration
<dbl> <dbl> <chr>
1 32.7 -115. 24 HOUR
2 32.7 -115. 24 HOUR
3 32.5 -115. 1 HOUR
4 32.5 -115. 8-HR RUN AVG BEGIN HOUR
5 32.5 -115. 8-HR RUN AVG BEGIN HOUR
6 32.5 -115. 8-HR RUN AVG BEGIN HOUR
# Take a look at the State.Code variable to see how many measurements were recorded in each state
table(ozone$State.Code)
01 02 04 05 06 08 09 10 11 12 13 15 16 17 18
677 547 2031 340 9174 2452 387 270 405 1832 1537 470 548 1274 2282
19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
954 419 1075 599 912 952 1187 2051 1099 287 1142 1052 305 697 466
34 35 36 37 38 39 40 41 42 44 45 46 47 48 49
1273 813 2569 1961 535 2740 1427 1256 2356 709 682 338 743 3922 1337
50 51 53 54 55 56 72 78 80
626 842 1793 445 1127 1564 229 56 103
# State code 78 is the Virgin Islands; pull out its 56 records with dplyr's filter()
dplyr::filter(ozone, State.Code == "78")
# A tibble: 56 x 55
State.Code County.Code Site.Num Parameter.Code POC Latitude Longitude
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 78 010 0012 68105 1 17.7 -64.8
2 78 010 0012 68108 1 17.7 -64.8
3 78 010 0012 85101 1 17.7 -64.8
4 78 010 0012 85101 2 17.7 -64.8
5 78 010 0012 88101 1 17.7 -64.8
6 78 010 0012 88101 1 17.7 -64.8
7 78 010 0012 88101 1 17.7 -64.8
8 78 010 0012 88101 1 17.7 -64.8
9 78 010 0012 88501 3 17.7 -64.8
10 78 010 0012 88501 3 17.7 -64.8
# ... with 46 more rows, and 48 more variables: Datum <chr>,
# Parameter.Name <chr>, Sample.Duration <chr>, Pollutant.Standard <chr>,
# Metric.Used <chr>, Method.Name <chr>, Year <dbl>,
# Units.of.Measure <chr>, Event.Type <chr>, Observation.Count <dbl>,
# Observation.Percent <dbl>, Completeness.Indicator <chr>,
# Valid.Day.Count <dbl>, Required.Day.Count <dbl>,
# Exceptional.Data.Count <dbl>, Null.Data.Count <dbl>,
# Primary.Exceedance.Count <dbl>, Secondary.Exceedance.Count <dbl>,
# Certification.Indicator <chr>, Num.Obs.Below.MDL <dbl>,
# Arithmetic.Mean <dbl>, Arithmetic.Standard.Dev <dbl>,
# X1st.Max.Value <dbl>, X1st.Max.DateTime <dttm>, X2nd.Max.Value <dbl>,
# X2nd.Max.DateTime <dttm>, X3rd.Max.Value <dbl>,
# X3rd.Max.DateTime <dttm>, X4th.Max.Value <dbl>,
# X4th.Max.DateTime <dttm>, X1st.Max.Non.Overlapping.Value <dbl>,
# X1st.NO.Max.DateTime <dttm>, X2nd.Max.Non.Overlapping.Value <dbl>,
# X2nd.NO.Max.DateTime <dttm>, X99th.Percentile <dbl>,
# X98th.Percentile <dbl>, X95th.Percentile <dbl>,
# X90th.Percentile <dbl>, X75th.Percentile <dbl>,
# X50th.Percentile <dbl>, X10th.Percentile <dbl>, Local.Site.Name <chr>,
# Address <chr>, State.Name <chr>, County.Name <chr>, City.Name <chr>,
# CBSA.Name <chr>, Date.of.Last.Change <date>
# How many states are represented, and which ones?
length(unique(ozone$State.Name))
[1] 54
unique(ozone$State.Name)
[1] "Alabama" "Alaska" "Arizona"
[4] "Arkansas" "California" "Colorado"
[7] "Connecticut" "Delaware" "District Of Columbia"
[10] "Florida" "Georgia" "Hawaii"
[13] "Idaho" "Illinois" "Indiana"
[16] "Iowa" "Kansas" "Kentucky"
[19] "Louisiana" "Maine" "Maryland"
[22] "Massachusetts" "Michigan" "Minnesota"
[25] "Mississippi" "Missouri" "Montana"
[28] "Nebraska" "Nevada" "New Hampshire"
[31] "New Jersey" "New Mexico" "New York"
[34] "North Carolina" "North Dakota" "Ohio"
[37] "Oklahoma" "Oregon" "Pennsylvania"
[40] "Rhode Island" "South Carolina" "South Dakota"
[43] "Tennessee" "Texas" "Utah"
[46] "Vermont" "Virginia" "Washington"
[49] "West Virginia" "Wisconsin" "Wyoming"
[52] "Puerto Rico" "Virgin Islands" "Country Of Mexico"
# Distribution of the measured values with summary() and quantile()
# (likely the Arithmetic.Mean column; note it mixes units across different parameters)
summary(ozone$Arithmetic.Mean)
   Min. 1st Qu. Median  Mean 3rd Qu.     Max.
 -14.63    0.04   0.40 43.99    7.72 50128.95
quantile(ozone$Arithmetic.Mean)
        0%        25%        50%        75%        100%
-14.626471   0.036608   0.400000   7.723881 50128.946050
Data Transformation
library(nycflights13)
library(dplyr)
# The flights dataset
flights
# A tibble: 336,776 x 19
year month day dep_time sched_dep_time dep_delay arr_time
<int> <int> <int> <int> <int> <dbl> <int>
1 2013 1 1 517 515 2 830
2 2013 1 1 533 529 4 850
3 2013 1 1 542 540 2 923
4 2013 1 1 544 545 -1 1004
5 2013 1 1 554 600 -6 812
6 2013 1 1 554 558 -4 740
7 2013 1 1 555 600 -5 913
8 2013 1 1 557 600 -3 709
9 2013 1 1 557 600 -3 838
10 2013 1 1 558 600 -2 753
# ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
# arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
# origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
# minute <dbl>, time_hour <dttm>
##dplyr
#Pick observations by their values (filter()).
#Reorder the rows (arrange()).
#Pick variables by their names (select()).
#Create new variables with functions of existing variables (mutate()).
#Collapse many values down to a single summary (summarise()).
##filter
#filter() allows you to subset observations based on their values. The first argument is the name of the data frame. The second and subsequent arguments are the expressions that filter the data frame.
jan1 <- filter(flights, month == 1, day == 1)
(dec25 <- filter(flights, month == 12, day == 25))
# A tibble: 719 x 19
year month day dep_time sched_dep_time dep_delay arr_time
<int> <int> <int> <int> <int> <dbl> <int>
1 2013 12 25 456 500 -4 649
2 2013 12 25 524 515 9 805
3 2013 12 25 542 540 2 832
4 2013 12 25 546 550 -4 1022
5 2013 12 25 556 600 -4 730
6 2013 12 25 557 600 -3 743
7 2013 12 25 557 600 -3 818
8 2013 12 25 559 600 -1 855
9 2013 12 25 559 600 -1 849
10 2013 12 25 600 600 0 850
# ... with 709 more rows, and 12 more variables: sched_arr_time <int>,
# arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
# origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
# minute <dbl>, time_hour <dttm>
near(sqrt(2) ^ 2, 2)
[1] TRUE
near(1 / 49 * 49, 1)
[1] TRUE
#Computers use finite precision arithmetic, so remember that every number you see is an approximation. Instead of relying on ==, use near()
#Finds all flights that departed in November or December
filter(flights, month == 11 | month == 12)
# A tibble: 55,403 x 19
year month day dep_time sched_dep_time dep_delay arr_time
<int> <int> <int> <int> <int> <dbl> <int>
1 2013 11 1 5 2359 6 352
2 2013 11 1 35 2250 105 123
3 2013 11 1 455 500 -5 641
4 2013 11 1 539 545 -6 856
5 2013 11 1 542 545 -3 831
6 2013 11 1 549 600 -11 912
7 2013 11 1 550 600 -10 705
8 2013 11 1 554 600 -6 659
9 2013 11 1 554 600 -6 826
10 2013 11 1 554 600 -6 749
# ... with 55,393 more rows, and 12 more variables: sched_arr_time <int>,
# arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
# origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
# minute <dbl>, time_hour <dttm>
#"x %in% y" will select every row where x is one of the values in y
nov_dec <- filter(flights, month %in% c(11, 12))
filter(flights, !(arr_delay > 120 | dep_delay > 120))
# A tibble: 316,050 x 19
year month day dep_time sched_dep_time dep_delay arr_time
<int> <int> <int> <int> <int> <dbl> <int>
1 2013 1 1 517 515 2 830
2 2013 1 1 533 529 4 850
3 2013 1 1 542 540 2 923
4 2013 1 1 544 545 -1 1004
5 2013 1 1 554 600 -6 812
6 2013 1 1 554 558 -4 740
7 2013 1 1 555 600 -5 913
8 2013 1 1 557 600 -3 709
9 2013 1 1 557 600 -3 838
10 2013 1 1 558 600 -2 753
# ... with 316,040 more rows, and 12 more variables: sched_arr_time <int>,
# arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
# origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
# minute <dbl>, time_hour <dttm>
# Equivalently (De Morgan's law, stated below):
filter(flights, arr_delay <= 120, dep_delay <= 120)
# A tibble: 316,050 x 19
year month day dep_time sched_dep_time dep_delay arr_time
<int> <int> <int> <int> <int> <dbl> <int>
1 2013 1 1 517 515 2 830
2 2013 1 1 533 529 4 850
3 2013 1 1 542 540 2 923
4 2013 1 1 544 545 -1 1004
5 2013 1 1 554 600 -6 812
6 2013 1 1 554 558 -4 740
7 2013 1 1 555 600 -5 913
8 2013 1 1 557 600 -3 709
9 2013 1 1 557 600 -3 838
10 2013 1 1 558 600 -2 753
# ... with 316,040 more rows, and 12 more variables: sched_arr_time <int>,
# arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
# origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
# minute <dbl>, time_hour <dttm>
#De Morgan’s law: !(x & y) is the same as !x | !y, and !(x | y) is the same as !x & !y
# Missing values
NA > 5
[1] NA
10 == NA
[1] NA
NA + 10
[1] NA
x <- NA
is.na(x)
[1] TRUE
df <- tibble(x = c(1, NA, 3))
filter(df, x > 1)
# A tibble: 1 x 1
x
<dbl>
1 3
#filter() only includes rows where the condition is TRUE; it excludes both FALSE and NA values. If you want to preserve missing values, ask for them explicitly
filter(df, is.na(x) | x > 1)
# A tibble: 2 x 1
x
<dbl>
1 NA
2 3
##Arrange
#arrange() works similarly to filter() except that instead of selecting rows, it changes their order
arrange(flights, year, month, day)
# A tibble: 336,776 x 19
year month day dep_time sched_dep_time dep_delay arr_time
<int> <int> <int> <int> <int> <dbl> <int>
1 2013 1 1 517 515 2 830
2 2013 1 1 533 529 4 850
3 2013 1 1 542 540 2 923
4 2013 1 1 544 545 -1 1004
5 2013 1 1 554 600 -6 812
6 2013 1 1 554 558 -4 740
7 2013 1 1 555 600 -5 913
8 2013 1 1 557 600 -3 709
9 2013 1 1 557 600 -3 838
10 2013 1 1 558 600 -2 753
# ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
# arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
# origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
# minute <dbl>, time_hour <dttm>
#If you provide more than one column name, each additional column will be used to break ties in the values of the preceding columns
#desc() to re-order by a column in descending order
arrange(flights, desc(dep_delay))
# A tibble: 336,776 x 19
year month day dep_time sched_dep_time dep_delay arr_time
<int> <int> <int> <int> <int> <dbl> <int>
1 2013 1 9 641 900 1301 1242
2 2013 6 15 1432 1935 1137 1607
3 2013 1 10 1121 1635 1126 1239
4 2013 9 20 1139 1845 1014 1457
5 2013 7 22 845 1600 1005 1044
6 2013 4 10 1100 1900 960 1342
7 2013 3 17 2321 810 911 135
8 2013 6 27 959 1900 899 1236
9 2013 7 22 2257 759 898 121
10 2013 12 5 756 1700 896 1058
# ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
# arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
# origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
# minute <dbl>, time_hour <dttm>
# Missing values are always sorted at the end
df <- tibble(x = c(5, 2, NA))
arrange(df, x)
# A tibble: 3 x 1
      x
  <dbl>
1     2
2     5
3    NA
arrange(df, desc(x))
# A tibble: 3 x 1
      x
  <dbl>
1     5
2     2
3    NA
## select
#select() allows you to rapidly zoom in on a useful subset of columns using operations based on the names of the variables
# Select columns by name
select(flights, year, month, day)
# A tibble: 336,776 x 3
year month day
<int> <int> <int>
1 2013 1 1
2 2013 1 1
3 2013 1 1
4 2013 1 1
5 2013 1 1
6 2013 1 1
7 2013 1 1
8 2013 1 1
9 2013 1 1
10 2013 1 1
# ... with 336,766 more rows
# Select all columns between year and day (inclusive)
select(flights, year:day)
# A tibble: 336,776 x 3
year month day
<int> <int> <int>
1 2013 1 1
2 2013 1 1
3 2013 1 1
4 2013 1 1
5 2013 1 1
6 2013 1 1
7 2013 1 1
8 2013 1 1
9 2013 1 1
10 2013 1 1
# ... with 336,766 more rows
# Select all columns except those from year to day (inclusive)
select(flights, -(year:day))
# A tibble: 336,776 x 16
dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay
<int> <int> <dbl> <int> <int> <dbl>
1 517 515 2 830 819 11
2 533 529 4 850 830 20
3 542 540 2 923 850 33
4 544 545 -1 1004 1022 -18
5 554 600 -6 812 837 -25
6 554 558 -4 740 728 12
7 555 600 -5 913 854 19
8 557 600 -3 709 723 -14
9 557 600 -3 838 846 -8
10 558 600 -2 753 745 8
# ... with 336,766 more rows, and 10 more variables: carrier <chr>,
# flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
# distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
#starts_with("abc"): matches names that begin with “abc”.
#ends_with("xyz"): matches names that end with “xyz”.
#contains("ijk"): matches names that contain “ijk”.
#matches("(.)\\1"): selects variables that match a regular expression. This one matches any variables that contain repeated characters. You’ll learn more about regular expressions in strings.
#num_range("x", 1:3): matches x1, x2 and x3.
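# For example, two of these helpers applied to flights (a quick sketch):
select(flights, ends_with("delay"))  # dep_delay and arr_delay
select(flights, contains("time"))    # every column with "time" in its name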
rename(flights, tail_num = tailnum)
# A tibble: 336,776 x 19
year month day dep_time sched_dep_time dep_delay arr_time
<int> <int> <int> <int> <int> <dbl> <int>
1 2013 1 1 517 515 2 830
2 2013 1 1 533 529 4 850
3 2013 1 1 542 540 2 923
4 2013 1 1 544 545 -1 1004
5 2013 1 1 554 600 -6 812
6 2013 1 1 554 558 -4 740
7 2013 1 1 555 600 -5 913
8 2013 1 1 557 600 -3 709
9 2013 1 1 557 600 -3 838
10 2013 1 1 558 600 -2 753
# ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
# arr_delay <dbl>, carrier <chr>, flight <int>, tail_num <chr>,
# origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
# minute <dbl>, time_hour <dttm>
# Move a handful of variables to the start of the data frame with everything()
select(flights, time_hour, air_time, everything())
# A tibble: 336,776 x 19
time_hour air_time year month day dep_time sched_dep_time
<dttm> <dbl> <int> <int> <int> <int> <int>
1 2013-01-01 05:00:00 227 2013 1 1 517 515
2 2013-01-01 05:00:00 227 2013 1 1 533 529
3 2013-01-01 05:00:00 160 2013 1 1 542 540
4 2013-01-01 05:00:00 183 2013 1 1 544 545
5 2013-01-01 06:00:00 116 2013 1 1 554 600
6 2013-01-01 05:00:00 150 2013 1 1 554 558
7 2013-01-01 06:00:00 158 2013 1 1 555 600
8 2013-01-01 06:00:00 53 2013 1 1 557 600
9 2013-01-01 06:00:00 140 2013 1 1 557 600
10 2013-01-01 06:00:00 138 2013 1 1 558 600
# ... with 336,766 more rows, and 12 more variables: dep_delay <dbl>,
# arr_time <int>, sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
# flight <int>, tailnum <chr>, origin <chr>, dest <chr>, distance <dbl>,
# hour <dbl>, minute <dbl>
## Add new variables with mutate()
flights_sml <- select(flights,
year:day,
ends_with("delay"),
distance,
air_time
)
mutate(flights_sml,
gain = dep_delay - arr_delay,
hours = air_time / 60,
gain_per_hour = gain / hours
)
# A tibble: 336,776 x 10
year month day dep_delay arr_delay distance air_time gain hours
<int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2013 1 1 2 11 1400 227 -9 3.78
2 2013 1 1 4 20 1416 227 -16 3.78
3 2013 1 1 2 33 1089 160 -31 2.67
4 2013 1 1 -1 -18 1576 183 17 3.05
5 2013 1 1 -6 -25 762 116 19 1.93
6 2013 1 1 -4 12 719 150 -16 2.5
7 2013 1 1 -5 19 1065 158 -24 2.63
8 2013 1 1 -3 -14 229 53 11 0.883
9 2013 1 1 -3 -8 944 140 5 2.33
10 2013 1 1 -2 8 733 138 -10 2.3
# ... with 336,766 more rows, and 1 more variable: gain_per_hour <dbl>
#only want to keep the new variables
transmute(flights,
gain = dep_delay - arr_delay,
hours = air_time / 60,
gain_per_hour = gain / hours
)
# A tibble: 336,776 x 3
gain hours gain_per_hour
<dbl> <dbl> <dbl>
1 -9 3.78 -2.38
2 -16 3.78 -4.23
3 -31 2.67 -11.6
4 17 3.05 5.57
5 19 1.93 9.83
6 -16 2.5 -6.4
7 -24 2.63 -9.11
8 11 0.883 12.5
9 5 2.33 2.14
10 -10 2.3 -4.35
# ... with 336,766 more rows
#Arithmetic operators: +, -, *, /, ^. These are all vectorised, using the so called “recycling rules”. If one parameter is shorter than the other, it will be automatically extended to be the same length. This is most useful when one of the arguments is a single number: air_time / 60, hours * 60 + minute, etc.
#Arithmetic operators are also useful in conjunction with the aggregate functions you’ll learn about later. For example, x / sum(x) calculates the proportion of a total, and y - mean(y) computes the difference from the mean.
#Modular arithmetic: %/% (integer division) and %% (remainder), where x == y * (x %/% y) + (x %% y). Modular arithmetic is a handy tool because it allows you to break integers up into pieces. For example, in the flights dataset, you can compute hour and minute from dep_time with:
transmute(flights,
dep_time,
hour = dep_time %/% 100,
minute = dep_time %% 100
)
# A tibble: 336,776 x 3
dep_time hour minute
<int> <dbl> <dbl>
1 517 5 17
2 533 5 33
3 542 5 42
4 544 5 44
5 554 5 54
6 554 5 54
7 555 5 55
8 557 5 57
9 557 5 57
10 558 5 58
# ... with 336,766 more rows
#Offsets: lead() and lag() allow you to refer to leading or lagging values. This allows you to compute running differences (e.g. x - lag(x)) or find when values change (x != lag(x)). They are most useful in conjunction with group_by(), which you’ll learn about shortly.
(x <- 1:10)
 [1] 1 2 3 4 5 6 7 8 9 10
lag(x)
 [1] NA 1 2 3 4 5 6 7 8 9
lead(x)
 [1] 2 3 4 5 6 7 8 9 10 NA
x
 [1] 1 2 3 4 5 6 7 8 9 10
cumsum(x)
 [1] 1 3 6 10 15 21 28 36 45 55
cummean(x)
 [1] 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.5
#Cumulative and rolling aggregates: R provides functions for running sums, products, mins and maxes: cumsum(), cumprod(), cummin(), cummax(); and dplyr provides cummean() for cumulative means
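# Quick examples of the other cumulative aggregates (results shown as comments):
cumprod(1:5)        # 1 2 6 24 120
cummin(c(3, 1, 2))  # 3 1 1
cummax(c(1, 3, 2))  # 1 3 3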
#ranking
y <- c(1, 2, 2, NA, 3, 4)
min_rank(y)
[1] 1 2 2 NA 4 5
min_rank(desc(y))
[1] 5 3 3 NA 2 1
row_number(y)
[1] 1 2 3 NA 4 5
dense_rank(y)
[1] 1 2 2 NA 3 4
percent_rank(y)
[1] 0.00 0.25 0.25 NA 0.75 1.00
cume_dist(y)
[1] 0.2 0.6 0.6 NA 0.8 1.0
##Summaries
#summarise() collapses a data frame to a single row
summarise(flights, delay = mean(dep_delay, na.rm = TRUE))
# A tibble: 1 x 1
delay
<dbl>
1 12.6
# group_by() + summarise() gives per-group summaries; na.rm = TRUE removes the missing values first
by_day <- group_by(flights, year, month, day)
summarise(by_day, delay = mean(dep_delay, na.rm = TRUE))
# A tibble: 365 x 4
# Groups: year, month [12]
year month day delay
<int> <int> <int> <dbl>
1 2013 1 1 11.5
2 2013 1 2 13.9
3 2013 1 3 11.0
4 2013 1 4 8.95
5 2013 1 5 5.73
6 2013 1 6 7.15
7 2013 1 7 5.42
8 2013 1 8 2.55
9 2013 1 9 2.28
10 2013 1 10 2.84
# ... with 355 more rows
## %>% Pipe to combine multiple operations
by_dest <- group_by(flights, dest)
#Group flights by destination.
delay <- summarise(by_dest,
count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)
)
#Summarise to compute distance, average delay, and number of flights.
delay <- filter(delay, count > 20, dest != "HNL")
#Filter to remove noisy points and Honolulu airport, which is almost twice as far away as the next closest airport
delays <- flights %>%
group_by(dest) %>%
summarise(
count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)
) %>%
filter(count > 20, dest != "HNL")
# delays increase with distance up to ~750 miles and then decrease. Maybe as flights get longer there's more ability to make up delays in the air
#ggplot(data = delay, mapping = aes(x = dist, y = delay)) +
# geom_point(aes(size = count), alpha = 1/3) +
# geom_smooth(se = FALSE)
# Remove the cancelled flights (those with a missing dep_delay or arr_delay)
not_cancelled <- flights %>%
filter(!is.na(dep_delay), !is.na(arr_delay))
not_cancelled %>%
group_by(year, month, day) %>%
  summarise(mean = mean(dep_delay))
# A tibble: 365 x 4
# Groups: year, month [12]
year month day mean
<int> <int> <int> <dbl>
1 2013 1 1 11.4
2 2013 1 2 13.7
3 2013 1 3 10.9
4 2013 1 4 8.97
5 2013 1 5 5.73
6 2013 1 6 7.15
7 2013 1 7 5.42
8 2013 1 8 2.56
9 2013 1 9 2.30
10 2013 1 10 2.84
# ... with 355 more rows
#Whenever you do any aggregation, it’s always a good idea to include either a count (n()), or a count of non-missing values (sum(!is.na(x))). That way you can check that you’re not drawing conclusions based on very small amounts of data
# The planes (identified by their tail number) that have the highest average delays
delays <- not_cancelled %>%
group_by(tailnum) %>%
summarise(
delay = mean(arr_delay)
)
#ggplot(data = delays, mapping = aes(x = delay)) +
# geom_freqpoly(binwidth = 10)
delays <- not_cancelled %>%
group_by(tailnum) %>%
summarise(
delay = mean(arr_delay, na.rm = TRUE),
n = n()
)
#ggplot(data = delays, mapping = aes(x = n, y = delay)) +
# geom_point(alpha = 1/10)
delays %>%
filter(n > 25) %>%
ggplot(mapping = aes(x = n, y = delay)) +
  geom_point(alpha = 1/10)
# Convert to a tibble so it prints nicely
batting <- as_tibble(Lahman::Batting)
batters <- batting %>%
group_by(playerID) %>%
summarise(
ba = sum(H, na.rm = TRUE) / sum(AB, na.rm = TRUE),
ab = sum(AB, na.rm = TRUE)
)
batters %>%
filter(ab > 100) %>%
ggplot(mapping = aes(x = ab, y = ba)) +
geom_point() +
  geom_smooth(se = FALSE)
# The "best" batting averages all come from players with very few at-bats
batters %>%
  arrange(desc(ba))
# A tibble: 19,428 x 3
playerID ba ab
<chr> <dbl> <int>
1 abramge01 1 1
2 alberan01 1 1
3 allarko01 1 1
4 banisje01 1 1
5 bartocl01 1 1
6 bassdo01 1 1
7 birasst01 1 2
8 bruneju01 1 1
9 burnscb01 1 1
10 cammaer01 1 1
# ... with 19,418 more rows
not_cancelled %>%
group_by(year, month, day) %>%
summarise(
avg_delay1 = mean(arr_delay),
avg_delay2 = mean(arr_delay[arr_delay > 0]) # the average positive delay
)
# A tibble: 365 x 5
# Groups: year, month [12]
year month day avg_delay1 avg_delay2
<int> <int> <int> <dbl> <dbl>
1 2013 1 1 12.7 32.5
2 2013 1 2 12.7 32.0
3 2013 1 3 5.73 27.7
4 2013 1 4 -1.93 28.3
5 2013 1 5 -1.53 22.6
6 2013 1 6 4.24 24.4
7 2013 1 7 -4.95 27.8
8 2013 1 8 -3.23 20.8
9 2013 1 9 -0.264 25.6
10 2013 1 10 -5.90 27.3
# ... with 355 more rows
# Why is distance to some destinations more variable than to others?
not_cancelled %>%
group_by(dest) %>%
summarise(distance_sd = sd(distance)) %>%
  arrange(desc(distance_sd))
# A tibble: 104 x 2
dest distance_sd
<chr> <dbl>
1 EGE 10.5
2 SAN 10.4
3 SFO 10.2
4 HNL 10.0
5 SEA 9.98
6 LAS 9.91
7 PDX 9.87
8 PHX 9.86
9 LAX 9.66
10 IND 9.46
# ... with 94 more rows
# When do the first and last flights leave each day?
not_cancelled %>%
group_by(year, month, day) %>%
summarise(
first = min(dep_time),
last = max(dep_time)
)
# A tibble: 365 x 5
# Groups: year, month [12]
year month day first last
<int> <int> <int> <int> <int>
1 2013 1 1 517 2356
2 2013 1 2 42 2354
3 2013 1 3 32 2349
4 2013 1 4 25 2358
5 2013 1 5 14 2357
6 2013 1 6 16 2355
7 2013 1 7 49 2359
8 2013 1 8 454 2351
9 2013 1 9 2 2252
10 2013 1 10 3 2320
# ... with 355 more rows
not_cancelled %>%
group_by(year, month, day) %>%
summarise(
first_dep = first(dep_time),
last_dep = last(dep_time)
)
# A tibble: 365 x 5
# Groups: year, month [12]
year month day first_dep last_dep
<int> <int> <int> <int> <int>
1 2013 1 1 517 2356
2 2013 1 2 42 2354
3 2013 1 3 32 2349
4 2013 1 4 25 2358
5 2013 1 5 14 2357
6 2013 1 6 16 2355
7 2013 1 7 49 2359
8 2013 1 8 454 2351
9 2013 1 9 2 2252
10 2013 1 10 3 2320
# ... with 355 more rows
not_cancelled %>%
group_by(year, month, day) %>%
mutate(r = min_rank(desc(dep_time))) %>%
  filter(r %in% range(r))
# A tibble: 770 x 20
# Groups: year, month, day [365]
year month day dep_time sched_dep_time dep_delay arr_time
<int> <int> <int> <int> <int> <dbl> <int>
1 2013 1 1 517 515 2 830
2 2013 1 1 2356 2359 -3 425
3 2013 1 2 42 2359 43 518
4 2013 1 2 2354 2359 -5 413
5 2013 1 3 32 2359 33 504
6 2013 1 3 2349 2359 -10 434
7 2013 1 4 25 2359 26 505
8 2013 1 4 2358 2359 -1 429
9 2013 1 4 2358 2359 -1 436
10 2013 1 5 14 2359 15 503
# ... with 760 more rows, and 13 more variables: sched_arr_time <int>,
# arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
# origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
# minute <dbl>, time_hour <dttm>, r <int>
# Which destinations have the most carriers?
not_cancelled %>%
group_by(dest) %>%
summarise(carriers = n_distinct(carrier)) %>%
  arrange(desc(carriers))
# A tibble: 104 x 2
dest carriers
<chr> <int>
1 ATL 7
2 BOS 7
3 CLT 7
4 ORD 7
5 TPA 7
6 AUS 6
7 DCA 6
8 DTW 6
9 IAD 6
10 MSP 6
# ... with 94 more rows
# Counts: how many flights went to each destination?
not_cancelled %>%
  count(dest)
# A tibble: 104 x 2
dest n
<chr> <int>
1 ABQ 254
2 ACK 264
3 ALB 418
4 ANC 8
5 ATL 16837
6 AUS 2411
7 AVL 261
8 BDL 412
9 BGR 358
10 BHM 269
# ... with 94 more rows
# “count” (sum) the total number of miles a plane flew
not_cancelled %>%
  count(tailnum, wt = distance)
# A tibble: 4,037 x 2
tailnum n
<chr> <dbl>
1 D942DN 3418
2 N0EGMQ 239143
3 N10156 109664
4 N102UW 25722
5 N103US 24619
6 N104UW 24616
7 N10575 139903
8 N105UW 23618
9 N107US 21677
10 N108UW 32070
# ... with 4,027 more rows
# How many flights left before 5am? (these usually indicate delayed flights from the previous day)
not_cancelled %>%
group_by(year, month, day) %>%
  summarise(n_early = sum(dep_time < 500))
# A tibble: 365 x 4
# Groups: year, month [12]
year month day n_early
<int> <int> <int> <int>
1 2013 1 1 0
2 2013 1 2 3
3 2013 1 3 4
4 2013 1 4 3
5 2013 1 5 3
6 2013 1 6 2
7 2013 1 7 2
8 2013 1 8 1
9 2013 1 9 3
10 2013 1 10 3
# ... with 355 more rows
# What proportion of flights are delayed by more than an hour?
not_cancelled %>%
group_by(year, month, day) %>%
  summarise(hour_perc = mean(arr_delay > 60))
# A tibble: 365 x 4
# Groups: year, month [12]
year month day hour_perc
<int> <int> <int> <dbl>
1 2013 1 1 0.0722
2 2013 1 2 0.0851
3 2013 1 3 0.0567
4 2013 1 4 0.0396
5 2013 1 5 0.0349
6 2013 1 6 0.0470
7 2013 1 7 0.0333
8 2013 1 8 0.0213
9 2013 1 9 0.0202
10 2013 1 10 0.0183
# ... with 355 more rows
# Grouping by multiple variables lets you roll a summary up one level at a time
daily <- group_by(flights, year, month, day)
(per_day <- summarise(daily, flights = n()))
# A tibble: 365 x 4
# Groups: year, month [12]
year month day flights
<int> <int> <int> <int>
1 2013 1 1 842
2 2013 1 2 943
3 2013 1 3 914
4 2013 1 4 915
5 2013 1 5 720
6 2013 1 6 832
7 2013 1 7 933
8 2013 1 8 899
9 2013 1 9 902
10 2013 1 10 932
# ... with 355 more rows
(per_month <- summarise(per_day, flights = sum(flights)))
# A tibble: 12 x 3
# Groups: year [1]
year month flights
<int> <int> <int>
1 2013 1 27004
2 2013 2 24951
3 2013 3 28834
4 2013 4 28330
5 2013 5 28796
6 2013 6 28243
7 2013 7 29425
8 2013 8 29327
9 2013 9 27574
10 2013 10 28889
11 2013 11 27268
12 2013 12 28135
(per_year <- summarise(per_month, flights = sum(flights)))
# A tibble: 1 x 2
year flights
<int> <int>
1 2013 336776
##ungroup
daily %>%
ungroup() %>% # no longer grouped by date
  summarise(flights = n()) # all flights
# A tibble: 1 x 1
flights
<int>
1 336776
#Find the worst members of each group
flights_sml %>%
group_by(year, month, day) %>%
  filter(rank(desc(arr_delay)) < 10)
# A tibble: 3,306 x 7
# Groups: year, month, day [365]
year month day dep_delay arr_delay distance air_time
<int> <int> <int> <dbl> <dbl> <dbl> <dbl>
1 2013 1 1 853 851 184 41
2 2013 1 1 290 338 1134 213
3 2013 1 1 260 263 266 46
4 2013 1 1 157 174 213 60
5 2013 1 1 216 222 708 121
6 2013 1 1 255 250 589 115
7 2013 1 1 285 246 1085 146
8 2013 1 1 192 191 199 44
9 2013 1 1 379 456 1092 222
10 2013 1 2 224 207 550 94
# ... with 3,296 more rows
#Find all groups bigger than a threshold
popular_dests <- flights %>%
group_by(dest) %>%
filter(n() > 365)
popular_dests
# A tibble: 332,577 x 19
# Groups: dest [77]
year month day dep_time sched_dep_time dep_delay arr_time
<int> <int> <int> <int> <int> <dbl> <int>
1 2013 1 1 517 515 2 830
2 2013 1 1 533 529 4 850
3 2013 1 1 542 540 2 923
4 2013 1 1 544 545 -1 1004
5 2013 1 1 554 600 -6 812
6 2013 1 1 554 558 -4 740
7 2013 1 1 555 600 -5 913
8 2013 1 1 557 600 -3 709
9 2013 1 1 557 600 -3 838
10 2013 1 1 558 600 -2 753
# ... with 332,567 more rows, and 12 more variables: sched_arr_time <int>,
# arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
# origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
# minute <dbl>, time_hour <dttm>
#Standardise to compute per group metrics
popular_dests %>%
filter(arr_delay > 0) %>%
mutate(prop_delay = arr_delay / sum(arr_delay)) %>%
  select(year:day, dest, arr_delay, prop_delay)
# A tibble: 131,106 x 6
# Groups: dest [77]
year month day dest arr_delay prop_delay
<int> <int> <int> <chr> <dbl> <dbl>
1 2013 1 1 IAH 11 0.000111
2 2013 1 1 IAH 20 0.000201
3 2013 1 1 MIA 33 0.000235
4 2013 1 1 ORD 12 0.0000424
5 2013 1 1 FLL 19 0.0000938
6 2013 1 1 ORD 8 0.0000283
7 2013 1 1 LAX 7 0.0000344
8 2013 1 1 DFW 31 0.000282
9 2013 1 1 ATL 12 0.0000400
10 2013 1 1 DTW 16 0.000116
# ... with 131,096 more rows
Tidy data
library(tidyverse)
#There are three interrelated rules which make a dataset tidy:
#1.Each variable must have its own column.
#2.Each observation must have its own row.
#3.Each value must have its own cell.
table1 # a tidy table
# A tibble: 6 x 4
country year cases population
<chr> <int> <int> <int>
1 Afghanistan 1999 745 19987071
2 Afghanistan 2000 2666 20595360
3 Brazil 1999 37737 172006362
4 Brazil 2000 80488 174504898
5 China 1999 212258 1272915272
6 China 2000 213766 1280428583
table2 # each observation is spread over two rows
# A tibble: 12 x 4
country year type count
<chr> <int> <chr> <int>
1 Afghanistan 1999 cases 745
2 Afghanistan 1999 population 19987071
3 Afghanistan 2000 cases 2666
4 Afghanistan 2000 population 20595360
5 Brazil 1999 cases 37737
6 Brazil 1999 population 172006362
7 Brazil 2000 cases 80488
8 Brazil 2000 population 174504898
9 China 1999 cases 212258
10 China 1999 population 1272915272
11 China 2000 cases 213766
12 China 2000 population 1280428583
table3 # two variables combined into one "rate" column
# A tibble: 6 x 3
country year rate
* <chr> <int> <chr>
1 Afghanistan 1999 745/19987071
2 Afghanistan 2000 2666/20595360
3 Brazil 1999 37737/172006362
4 Brazil 2000 80488/174504898
5 China 1999 212258/1272915272
6 China 2000 213766/1280428583
table4a # the cases, spread across two year columns
# A tibble: 3 x 3
country `1999` `2000`
* <chr> <int> <int>
1 Afghanistan 745 2666
2 Brazil 37737 80488
3 China 212258 213766
table4b # the matching populations
# A tibble: 3 x 3
country `1999` `2000`
* <chr> <int> <int>
1 Afghanistan 19987071 20595360
2 Brazil 172006362 174504898
3 China 1272915272 1280428583
# With tidy table1 it is easy to compute the rate per 10,000
table1 %>%
  mutate(rate = cases / population * 10000)
# A tibble: 6 x 5
country year cases population rate
<chr> <int> <int> <int> <dbl>
1 Afghanistan 1999 745 19987071 0.373
2 Afghanistan 2000 2666 20595360 1.29
3 Brazil 1999 37737 172006362 2.19
4 Brazil 2000 80488 174504898 4.61
5 China 1999 212258 1272915272 1.67
6 China 2000 213766 1280428583 1.67
# Or the total cases per year
table1 %>%
  count(year, wt = cases)
# A tibble: 2 x 2
year n
<int> <int>
1 1999 250740
2 2000 296920
# Visualise changes over time
#library(ggplot2)
#ggplot(table1, aes(year, cases)) +
# geom_line(aes(group = country), colour = "grey50") +
# geom_point(aes(colour = country))
##Gather
#some of the column names are not names of variables, but values of a variable. Take table4a: the column names 1999 and 2000 represent values of the year variable, and each row represents two observations, not one
table4a
# A tibble: 3 x 3
country `1999` `2000`
* <chr> <int> <int>
1 Afghanistan 745 2666
2 Brazil 37737 80488
3 China 212258 213766
#The name of the variable whose values form the column names. I call that the key, and here it is year.
#The name of the variable whose values are spread over the cells. I call that value, and here it’s the number of cases.
table4a %>%
gather(`1999`, `2000`, key = "year", value = "cases")# A tibble: 6 x 3
country year cases
<chr> <chr> <int>
1 Afghanistan 1999 745
2 Brazil 1999 37737
3 China 1999 212258
4 Afghanistan 2000 2666
5 Brazil 2000 80488
6 China 2000 213766
#Gather table4b the same way; this time the variable stored in the cell values is population:
table4b %>%
gather(`1999`, `2000`, key = "year", value = "population")# A tibble: 6 x 3
country year population
<chr> <chr> <int>
1 Afghanistan 1999 19987071
2 Brazil 1999 172006362
3 China 1999 1272915272
4 Afghanistan 2000 20595360
5 Brazil 2000 174504898
6 China 2000 1280428583
#To combine the tidied versions of table4a and table4b into a single tibble, we need to use dplyr::left_join()
tidy4a <- table4a %>%
gather(`1999`, `2000`, key = "year", value = "cases")
tidy4b <- table4b %>%
gather(`1999`, `2000`, key = "year", value = "population")
left_join(tidy4a, tidy4b)
# A tibble: 6 x 4
country year cases population
<chr> <chr> <int> <int>
1 Afghanistan 1999 745 19987071
2 Brazil 1999 37737 172006362
3 China 1999 212258 1272915272
4 Afghanistan 2000 2666 20595360
5 Brazil 2000 80488 174504898
6 China 2000 213766 1280428583
##Spreading
#Spreading is the opposite of gathering. You use it when an observation is scattered across multiple rows
table2
# A tibble: 12 x 4
country year type count
<chr> <int> <chr> <int>
1 Afghanistan 1999 cases 745
2 Afghanistan 1999 population 19987071
3 Afghanistan 2000 cases 2666
4 Afghanistan 2000 population 20595360
5 Brazil 1999 cases 37737
6 Brazil 1999 population 172006362
7 Brazil 2000 cases 80488
8 Brazil 2000 population 174504898
9 China 1999 cases 212258
10 China 1999 population 1272915272
11 China 2000 cases 213766
12 China 2000 population 1280428583
#The column that contains variable names, the key column. Here, it’s type.
#The column that contains values from multiple variables, the value column. Here it’s count.
table2 %>%
  spread(key = type, value = count)
# A tibble: 6 x 4
country year cases population
<chr> <int> <int> <int>
1 Afghanistan 1999 745 19987071
2 Afghanistan 2000 2666 20595360
3 Brazil 1999 37737 172006362
4 Brazil 2000 80488 174504898
5 China 1999 212258 1272915272
6 China 2000 213766 1280428583
#gather() makes wide tables narrower and longer; spread() makes long tables shorter and wider.
##Separate
#pulls apart one column into multiple columns, by splitting wherever a separator character appears
table3
# A tibble: 6 x 3
country year rate
* <chr> <int> <chr>
1 Afghanistan 1999 745/19987071
2 Afghanistan 2000 2666/20595360
3 Brazil 1999 37737/172006362
4 Brazil 2000 80488/174504898
5 China 1999 212258/1272915272
6 China 2000 213766/1280428583
#The rate column contains both cases and population variables, and we need to split it into two variables
table3 %>%
separate(rate, into = c("cases", "population"))# A tibble: 6 x 4
country year cases population
<chr> <int> <chr> <chr>
1 Afghanistan 1999 745 19987071
2 Afghanistan 2000 2666 20595360
3 Brazil 1999 37737 172006362
4 Brazil 2000 80488 174504898
5 China 1999 212258 1272915272
6 China 2000 213766 1280428583
#separate() will split values wherever it sees a non-alphanumeric character
table3 %>%
separate(rate, into = c("cases", "population"), sep = "/")# A tibble: 6 x 4
country year cases population
<chr> <int> <chr> <chr>
1 Afghanistan 1999 745 19987071
2 Afghanistan 2000 2666 20595360
3 Brazil 1999 37737 172006362
4 Brazil 2000 80488 174504898
5 China 1999 212258 1272915272
6 China 2000 213766 1280428583
#If you wish to use a specific character to separate a column, you can pass the character to the sep argument of separate().
table3 %>%
separate(rate, into = c("cases", "population"), convert = TRUE)# A tibble: 6 x 4
country year cases population
<chr> <int> <int> <int>
1 Afghanistan 1999 745 19987071
2 Afghanistan 2000 2666 20595360
3 Brazil 1999 37737 172006362
4 Brazil 2000 80488 174504898
5 China 1999 212258 1272915272
6 China 2000 213766 1280428583
#ask separate() to try and convert to better types using convert = TRUE
#pass a vector of integers to sep and separate() will interpret the integers as positions to split at. Positive values start at 1 on the far left of the strings; negative values start at -1 on the far right. When using integers to separate strings, the length of sep should be one less than the number of names in "into"
#Separate the last two digits of each year
table3 %>%
separate(year, into = c("century", "year"), sep = 2)# A tibble: 6 x 4
country century year rate
<chr> <chr> <chr> <chr>
1 Afghanistan 19 99 745/19987071
2 Afghanistan 20 00 2666/20595360
3 Brazil 19 99 37737/172006362
4 Brazil 20 00 80488/174504898
5 China 19 99 212258/1272915272
6 China 20 00 213766/1280428583
#unite() is the inverse of separate(): it combines multiple columns into a single column
#use unite() to rejoin the century and year columns
tidyr::table5
# A tibble: 6 x 4
country century year rate
* <chr> <chr> <chr> <chr>
1 Afghanistan 19 99 745/19987071
2 Afghanistan 20 00 2666/20595360
3 Brazil 19 99 37737/172006362
4 Brazil 20 00 80488/174504898
5 China 19 99 212258/1272915272
6 China 20 00 213766/1280428583
table5 %>%
  unite(new, century, year)
# A tibble: 6 x 3
country new rate
<chr> <chr> <chr>
1 Afghanistan 19_99 745/19987071
2 Afghanistan 20_00 2666/20595360
3 Brazil 19_99 37737/172006362
4 Brazil 20_00 80488/174504898
5 China 19_99 212258/1272915272
6 China 20_00 213766/1280428583
# The default separator is "_"; pass sep = "" to use none
table5 %>%
  unite(new, century, year, sep = "")
# A tibble: 6 x 3
country new rate
<chr> <chr> <chr>
1 Afghanistan 1999 745/19987071
2 Afghanistan 2000 2666/20595360
3 Brazil 1999 37737/172006362
4 Brazil 2000 80488/174504898
5 China 1999 212258/1272915272
6 China 2000 213766/1280428583
##Missing values
#Explicitly, i.e. flagged with NA.
#Implicitly, i.e. simply not present in the data.
stocks <- tibble(
year = c(2015, 2015, 2015, 2015, 2016, 2016, 2016),
qtr = c( 1, 2, 3, 4, 2, 3, 4),
return = c(1.88, 0.59, 0.35, NA, 0.92, 0.17, 2.66)
)
#make the implicit missing value explicit by putting years in the columns
stocks %>%
  spread(year, return)
# A tibble: 4 x 3
qtr `2015` `2016`
<dbl> <dbl> <dbl>
1 1 1.88 NA
2 2 0.59 0.92
3 3 0.35 0.17
4 4 NA 2.66
#set na.rm = TRUE in gather() to turn explicit missing values implicit
stocks %>%
spread(year, return) %>%
  gather(year, return, `2015`:`2016`, na.rm = TRUE)
# A tibble: 6 x 3
qtr year return
<dbl> <chr> <dbl>
1 1 2015 1.88
2 2 2015 0.59
3 3 2015 0.35
4 2 2016 0.92
5 3 2016 0.17
6 4 2016 2.66
# complete() makes both missing values explicit
stocks %>%
  complete(year, qtr)
# A tibble: 8 x 3
year qtr return
<dbl> <dbl> <dbl>
1 2015 1 1.88
2 2015 2 0.59
3 2015 3 0.35
4 2015 4 NA
5 2016 1 NA
6 2016 2 0.92
7 2016 3 0.17
8 2016 4 2.66
#complete() takes a set of columns, and finds all unique combinations. It then ensures the original dataset contains all those values, filling in explicit NAs where necessary
#when a data source has primarily been used for data entry, missing values indicate that the previous value should be carried forward
treatment <- tribble(
~ person, ~ treatment, ~response,
"Derrick Whitmore", 1, 7,
NA, 2, 10,
NA, 3, 9,
"Katherine Burke", 1, 4
)
#takes a set of columns where you want missing values to be replaced by the most recent non-missing value
treatment %>%
  fill(person)
# A tibble: 4 x 3
person treatment response
<chr> <dbl> <dbl>
1 Derrick Whitmore 1 7
2 Derrick Whitmore 2 10
3 Derrick Whitmore 3 9
4 Katherine Burke 1 4
Vectors
[1] "character"
[1] "integer"
[1] 3
#Logical vectors are the simplest type of atomic vector because they can take only three possible values: FALSE, TRUE, and NA
#comparisons
1:10 %% 3 == 0
 [1] FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE TRUE FALSE
c(TRUE, TRUE, FALSE, NA)
[1] TRUE TRUE FALSE NA
#Integer and double vectors are known collectively as numeric vectors
#To make an integer, place an L after the number
typeof(1)
[1] "double"
typeof(1L)
[1] "integer"
1.5L # R warns and returns a double; the L suffix only makes integers from whole numbers
[1] 1.5
x <- sqrt(2) ^ 2
x
[1] 2
x - 2
[1] 4.440892e-16
# when working with floating point numbers: most calculations include some approximation error. Instead of comparing floating point numbers using ==, you should use dplyr::near() which allows for some numerical tolerance
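# For example, with x <- sqrt(2) ^ 2 from above:
dplyr::near(x, 2) # TRUE, even though x == 2 is FALSE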
#Integers have one special value: NA, while doubles have four: NA, NaN, Inf and -Inf. All three special values NaN, Inf and -Inf can arise during division
c(-1, 0, 1) / 0
[1] -Inf NaN Inf
#Avoid using == to check for these other special values. Instead use the helper functions is.finite(), is.infinite(), and is.nan()
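# Checking the same division results with the helper functions (results as comments):
is.finite(c(-1, 0, 1) / 0)   # FALSE FALSE FALSE
is.infinite(c(-1, 0, 1) / 0) # TRUE FALSE TRUE
is.nan(c(-1, 0, 1) / 0)      # FALSE TRUE FALSE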
#Character
#each unique string is only stored in memory once, and every use of the string points to that representation. This reduces the amount of memory needed by duplicated strings. You can see this behaviour in practice with pryr::object_size()
x <- "This is a reasonably long string."
pryr::object_size(x)
152 B
y <- rep(x, 1000)
pryr::object_size(y)
8.14 kB
#y doesn't take up 1,000x as much memory as x, because each element of y is just a pointer to the same string. A pointer is 8 bytes, so 1,000 pointers plus one copy of the 152 B string come to roughly 8.14 kB
#Missing Value
#Each type of atomic vector has its own missing value
NA            # logical
[1] NA
NA_integer_   # integer
[1] NA
NA_real_      # double
[1] NA
NA_character_ # character
[1] NA
#Coercion
#There are two ways to convert, or coerce, one type of vector to another:
#1 Explicit coercion
# as.logical(), as.integer(), as.double(), or as.character()
#2 Implicit coercion
#EX: using a logical vector in a numeric context
#TRUE=1, FALSE=0
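# Explicit coercion, for example (results as comments):
as.integer(c("1", "2", "3")) # character to integer: 1 2 3
as.logical(c(0, 1, 1))       # double to logical: FALSE TRUE TRUE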
x <- sample(20, 100, replace = TRUE)
y <- x > 10
sum(y)  # how many are greater than 10?
[1] 48
mean(y) # what proportion are greater than 10?
[1] 0.48
#When you combine multiple types in a vector with c(), the most complex type always wins
typeof(c(TRUE, 1L))
[1] "integer"
typeof(c(1L, 1.5))
[1] "double"
typeof(c(1.5, "a"))
[1] "character"
#Each predicate also comes with a “scalar” version, like is_scalar_atomic(), which checks that the length is 1. This is useful, for example, if you want to check that an argument to your function is a single logical value.
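# A quick sketch of one scalar predicate (these live in the purrr package):
purrr::is_scalar_atomic(TRUE)    # TRUE: atomic and length 1
purrr::is_scalar_atomic(c(1, 2)) # FALSE: length 2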
#Vector recycling: implicitly coerce the length of vectors, because the shorter vector is repeated, or recycled, to the same length as the longer vector.
# Most functions in R are vectorised, meaning that they operate on a whole vector of numbers at once
sample(10) + 100
 [1] 109 102 106 110 105 107 108 104 101 103
runif(10) > 0.5
 [1] FALSE FALSE TRUE FALSE TRUE TRUE TRUE FALSE FALSE TRUE
#R will expand the shortest vector to the same length as the longest, so called recycling
1:10 + 1:2
 [1] 2 4 4 6 6 8 8 10 10 12
1:10 + 1:3 # the longer length is not a multiple of the shorter, so R also warns
 [1] 2 4 6 5 7 9 8 10 12 11
#While vector recycling can be used to create very succinct, clever code, it can also silently conceal problems. For this reason, the vectorised functions in tidyverse will throw errors when you recycle anything other than a scalar. If you do want to recycle, you’ll need to do it yourself with rep():
tibble(x = 1:4, y = rep(1:2, 2))
# A tibble: 4 x 2
x y
<int> <int>
1 1 1
2 2 2
3 3 1
4 4 2
tibble(x = 1:4, y = rep(1:2, each = 2))
# A tibble: 4 x 2
x y
<int> <int>
1 1 1
2 2 1
3 3 2
4 4 2
# Name a vector during creation with c():
c(x = 1, y = 2, z = 4)
x y z
1 2 4
# or after the fact with purrr::set_names()
set_names(1:3, c("a", "b", "c"))
a b c
1 2 3
##Subset
#1.A numeric vector containing only integers. The integers must either be all positive, all negative, or zero. Subsetting with positive integers keeps the elements at those positions:
x <- c("one", "two", "three", "four", "five")
x[c(3, 2, 5)]
[1] "three" "two" "five"
x[c(1, 1, 5, 5, 5, 2)]
[1] "one" "one" "five" "five" "five" "two"
#Negative values drop the elements at the specified positions
x[c(-1, -3, -5)]
[1] "two" "four"
#2.Subsetting with a logical vector keeps all values corresponding to a TRUE value. This is most often useful in conjunction with the comparison functions
x <- c(10, 3, NA, 5, 8, 1, NA)
# All non-missing values of x
x[!is.na(x)]
[1] 10 3 5 8 1
# All even (or missing!) values of x
x[x %% 2 == 0]
[1] 10 NA 8 NA
#3.If you have a named vector, you can subset it with a character vector
x <- c(abc = 1, def = 2, xyz = 5)
x[c("xyz", "def")]xyz def
5 2
##20.5 Recursive vectors (lists)
#create a list
x <- list(1, 2, 3)
x
[[1]]
[1] 1
[[2]]
[1] 2
[[3]]
[1] 3
str(x)
List of 3
$ : num 1
$ : num 2
$ : num 3
x_named <- list(a = 1, b = 2, c = 3)
str(x_named)
List of 3
$ a: num 1
$ b: num 2
$ c: num 3
#lists can contain a mix of objects
y <- list("a", 1L, 1.5, TRUE)
str(y)
List of 4
$ : chr "a"
$ : int 1
$ : num 1.5
$ : logi TRUE
#or even contain other lists
z <- list(list(1, 2), list(3, 4))
str(z)
List of 2
$ :List of 2
..$ : num 1
..$ : num 2
$ :List of 2
..$ : num 3
..$ : num 4
x1 <- list(c(1, 2), c(3, 4))
x2 <- list(list(1, 2), list(3, 4))
x3 <- list(1, list(2, list(3)))
a <- list(a = 1:3, b = "a string", c = pi, d = list(-1, -5))
str(a[1:2])
List of 2
$ a: int [1:3] 1 2 3
$ b: chr "a string"
str(a[4]) #[ extracts a sub-list. The result will always be a list
List of 1
$ d:List of 2
..$ : num -1
..$ : num -5
#[[ extracts a single component from a list. It removes a level of hierarchy from the list.
str(a[[1]])
int [1:3] 1 2 3
str(a[[4]])
List of 2
$ : num -1
$ : num -5
# $ extracts like [[, but you don't need quotes
a$a
[1] 1 2 3
a[["a"]]
[1] 1 2 3
##Attributes
x <- 1:10
attr(x, "greeting")
NULL
attr(x, "greeting") <- "Hi!"
attr(x, "farewell") <- "Bye!"
attributes(x)
$greeting
[1] "Hi!"
$farewell
[1] "Bye!"
##Factors
# Factors are designed to represent categorical data that can take a fixed set of possible values. Factors are built on top of integers, and have a levels attribute
x <- factor(c("ab", "cd", "ab"), levels = c("ab", "cd", "ef"))
typeof(x)
[1] "integer"
attributes(x)
$levels
[1] "ab" "cd" "ef"
$class
[1] "factor"
#Dates in R are numeric vectors that represent the number of days since 1 January 1970
x <- as.Date("1971-01-01")
unclass(x)
[1] 365
typeof(x)
[1] "double"
attributes(x)
$class
[1] "Date"
#Date-times are numeric vectors with class POSIXct that represent the number of seconds since 1 January 1970. (In case you were wondering, “POSIXct” stands for “Portable Operating System Interface”, calendar time.)
x <- lubridate::ymd_hm("1970-01-01 01:00")
unclass(x)
[1] 3600
attr(,"tzone")
[1] "UTC"
typeof(x)
[1] "double"
attributes(x)
$class
[1] "POSIXct" "POSIXt"
$tzone
[1] "UTC"
#The tzone attribute is optional. It controls how the time is printed, not what absolute time it refers to
attr(x, "tzone") <- "US/Pacific"
x[1] "1969-12-31 17:00:00 PST"
[1] "1969-12-31 20:00:00 EST"
# Another type of date-times called POSIXlt. These are built on top of named lists
y <- as.POSIXlt(x)
typeof(y)
[1] "list"
attributes(y)
$names
[1] "sec" "min" "hour" "mday" "mon" "year" "wday"
[8] "yday" "isdst" "zone" "gmtoff"
$class
[1] "POSIXlt" "POSIXt"
$tzone
[1] "US/Eastern" "EST" "EDT"
#POSIXct’s are always easier to work with, so if you find you have a POSIXlt, convert it to a regular date-time with lubridate::as_datetime()
#Tibbles are augmented lists: they have class “tbl_df” + “tbl” + “data.frame”, and names (column) and row.names attributes
tb <- tibble::tibble(x = 1:5, y = 5:1)
typeof(tb)
[1] "list"
attributes(tb)
$names
[1] "x" "y"
$row.names
[1] 1 2 3 4 5
$class
[1] "tbl_df" "tbl" "data.frame"
#The difference between a tibble and a list is that all the elements of a data frame must be vectors with the same length. All functions that work with tibbles enforce this constraint.
#Traditional data.frames have a very similar structure
df <- data.frame(x = 1:5, y = 5:1)
typeof(df)
[1] "list"
attributes(df)
$names
[1] "x" "y"
$class
[1] "data.frame"
$row.names
[1] 1 2 3 4 5
---
title: "EDA Code Portfolio Wk1-6"
output:
flexdashboard::flex_dashboard:
orientation: columns
source_code: embed
---
```{r setup}
library(flexdashboard)
```
YaRrr!
=====================================
Column {.tabset}
-------------------------------------
### Ch 8
Chapter 8 Matrices and Dataframes
```{r echo = TRUE}
# Create a dataframe of boat sale data named bsale
bsale <- data.frame(name = c("a", "b", "c", "d", "e", "f", "g", "h", "i", "j"),
color = c("black", "green", "pink", "blue", "blue",
"green", "green", "yellow", "black", "black"),
age = c(143, 53, 356, 23, 647, 24, 532, 43, 66, 86),
price = c(53, 87, 54, 66, 264, 32, 532, 58, 99, 132),
cost = c(52, 80, 20, 100, 189, 12, 520, 68, 80, 100),
stringsAsFactors = FALSE)
# Avoid converting strings to factors!
# Explore the bsale dataset:
head(bsale) # Show the first few rows
str(bsale) # Show the structure of the data
View(bsale) # Open the data in a new window
names(bsale) # Show the names of the columns
nrow(bsale) # Show how many rows are in the data
# Calculating statistics from column vectors
mean(bsale$age) # The mean age
table(bsale$color) # How many boats were there of each color?
max(bsale$price) # The maximum price
# Adding new columns
bsale$id <- 1:nrow(bsale)
bsale$age.decades <- bsale$age / 10
bsale$profit <- bsale$price - bsale$cost
# The mean price of green boats
with(bsale, mean(price[color == "green"]))
# The names of boats older than 100 years?
with(bsale, name[age > 100])
# What percent of black boats had a positive profit?
with(subset(bsale, color == "black"), mean(profit > 0))
# Save only the price and cost columns in a new dataframe
bsale.2 <- bsale[c("price", "cost")]
# Change the names of the columns to "p" and "c"
names(bsale.2) <- c("p", "c")
# Use subset() to create a dataframe called old.black.bsale containing only data from black boats older than 50 years
old.black.bsale <- subset(bsale, color == "black" & age > 50)
x <- 1:5
y <- 6:10
z <- 11:15
# Create a matrix where x, y and z are columns
cbind(x, y, z)
# Create a matrix where x, y and z are rows
rbind(x, y, z)
# Creating a matrix with numeric and character columns will make everything a character:
cbind(c(1, 2, 3, 4, 5),
c("a", "b", "c", "d", "e"))
# Create a matrix of the integers 1:10, with 5 rows and 2 columns
matrix(data = 1:10,
nrow = 5,
ncol = 2)
# with 2 rows and 5 columns
matrix(data = 1:10,
nrow = 2,
ncol = 5)
# With 2 rows and 5 columns, but fill by row instead of columns
matrix(data = 1:10,
nrow = 2,
ncol = 5,
byrow = TRUE)
# Create a dataframe of survey data
survey <- data.frame("index" = c(1, 2, 3, 4, 5),
"sex" = c("m", "m", "m", "f", "f"),
"age" = c(99, 46, 23, 54, 23))
survey
# Show me the structure of the survey dataframe
str(survey)
#R converted sex to a factor with only two possible levels
# Create a dataframe of survey data without factors
survey <- data.frame("index" = c(1, 2, 3, 4, 5),
"sex" = c("m", "m", "m", "f", "f"),
"age" = c(99, 46, 23, 54, 23),
stringsAsFactors = FALSE)
#check the structure to confirm no column was converted to a factor
str(survey)
#To see a complete list of all the datasets included in the datasets package, run the code: library(help = "datasets")
library(help = "datasets")
##8.3 Matrix and dataframe function
# head() shows the first few rows
head(ChickWeight)
# tail() shows the last few rows
tail(ChickWeight)
# View() opens the entire dataframe in a new window
View(ChickWeight)
# Print summary statistics of ToothGrowth to the console
summary(ToothGrowth)
# Print additional information about ToothGrowth to the console
str(ToothGrowth)
##8.4 Dataframe column names
# The names of columns in the ToothGrowth dataframe
names(ToothGrowth)
# Return the len column of ToothGrowth
ToothGrowth$len
# The mean of the len column of ToothGrowth?
mean(ToothGrowth$len)
# Table of the supp column of ToothGrowth.
table(ToothGrowth$supp)
#The len AND supp columns of ToothGrowth
head(ToothGrowth[c("len", "supp")])
# Create a new dataframe called survey
survey <- data.frame("index" = c(1, 2, 3, 4, 5),
"age" = c(24, 25, 42, 56, 22))
survey
# Add a new column called sex to survey
survey$sex <- c("m", "m", "f", "f", "m")
#survey with new sex column
survey
#Change name of 1st column of df to "a": names(df)[1] <- "a"
#Change name of 2nd column of df to "b": names(df)[2] <- "b"
# Change the name of the first column of survey to "participant.number"
names(survey)[1] <- "participant.number"
survey
# Change the column name from age to age.years
names(survey)[names(survey) == "age"] <- "age.years"
survey
##8.5 Slicing dataframes
# Return row 1 df[1, ]
# Return column 5 df[, 5]
# Rows 1:5 and column 2 df[1:5, 2]
# Show the rows 1-6 and column 1 of ToothGrowth
ToothGrowth[1:6, 1]
# Show the rows 1-3 and columns 1 and 3 of ToothGrowth
ToothGrowth[1:3, c(1,3)]
# Show the 1st row (and all columns) of ToothGrowth
ToothGrowth[1, ]
# Show the 2nd column (and all rows) of ToothGrowth
ToothGrowth[, 2]
# Create a new df with only the rows of ToothGrowth where supp equals VC
ToothGrowth.VC <- ToothGrowth[ToothGrowth$supp == "VC", ]
# Create a new df with only the rows of ToothGrowth where supp equals OJ and dose < 1
ToothGrowth.OJ.a <- ToothGrowth[ToothGrowth$supp == "OJ" &
ToothGrowth$dose < 1, ]
# Show rows of ToothGrowth where len < 20 AND supp == "OJ" AND dose >= 1
subset(x = ToothGrowth,
subset = len < 20 &
supp == "OJ" &
dose >= 1)
# Get rows of ToothGrowth where len > 30 AND supp == "VC", but only return the len and dose columns
subset(x = ToothGrowth,
subset = len > 30 & supp == "VC",
select = c(len, dose))
##8.6 Combining slicing with functions
# The mean tooth length of Guinea pigs given OJ
# Step 1: Create a subsetted dataframe called oj
oj <- subset(x = ToothGrowth,
subset = supp == "OJ")
# Step 2: Calculate the mean of the len column from the new subsetted dataset
mean(oj$len)
# Get the same solution using logical indexing
# Step 1: Create a subsetted dataframe called oj
oj <- ToothGrowth[ToothGrowth$supp == "OJ",]
# Step 2: Calculate the mean of the len column from the new subsetted dataset
mean(oj$len)
# Only referring to column vectors
mean(ToothGrowth$len[ToothGrowth$supp == "OJ"])
#The with() function saves typing when you use multiple columns from a dataframe: you name the dataframe once at the start of the call, and R then looks up every name in the following expression inside that dataframe first.
#create a dataframe called health
health <- data.frame("age" = c(32, 24, 43, 19, 43),
"height" = c(1.75, 1.65, 1.50, 1.92, 1.80),
"weight" = c(70, 65, 62, 79, 85))
#Add a new column called bmi which represents a person’s body mass index (BMI)
health$bmi <- health$weight / health$height ^ 2
#use with function
# Save typing by using with()
with(health, weight / height ^ 2)
```
### Ch 9
Chapter 9 Importing, saving and managing data
```{r echo = TRUE}
#print the current working directory
getwd()
#change the working directory to a different path
#setwd(dir= )
#see all the objects defined in the current workspace
ls()
# Create objects to save later
study1.df<-data.frame(id = 1:5,
sex = c("m", "m", "f", "f", "m"),
score = c(51, 20, 67, 52, 42))
score.by.sex<-aggregate(score ~ sex,
FUN = mean,
data = study1.df)
study1.htest<-t.test(score ~ sex,
data = study1.df)
# Save three objects as a new .RData file in the data folder of the current working directory
#save(study1.df, score.by.sex, study1.htest, file = "data/study1.RData")
#To save all the objects in your workspace as a .RData file, use the "save.image()" function
#To load an .RData file, that is, to import all of the objects contained in the .RData file into your current workspace, use the load() function
#To remove objects from your workspace, use the "rm()" function
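# A minimal sketch of those three calls, kept commented out so knitting this
# document doesn't overwrite files or clear the workspace (the file names
# here are examples only):
# save.image(file = "all_objects.RData")
# load(file = "data/study1.RData")
# rm(study1.df, score.by.sex)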
# Read a text file from the web
fromweb <- read.table(file = 'http://goo.gl/jTNf6P',
sep = '\t',
header = TRUE)
# Print the result
fromweb
a <- data.frame("sex" = c("m", "f", "m"),
"age" = c(19, 43, 25),
"favorite.movie" = c("Moon", "The Goonies", "Spice World"))
b <- mean(a$age)
c <- table(a$sex)
```
Exploratory Data Analysis with R
=====================================
Column {.tabset}
-------------------------------------
### Ch4
Exploratory Data Analysis Checklist
```{r echo = TRUE}
library(readr)
ozone<- read_csv("C:/Users/L5M/Desktop/HU/EDA/EDA_R_project/US EPA data 2017.csv")
names(ozone) <- make.names(names(ozone))
#make.names() rewrites the column names to remove any spaces
# check the number of rows and columns
nrow(ozone)
ncol(ozone)
#examine the classes of each of the columns
str(ozone)
#Look at the top 6 rows of data
head(ozone)
#Look at the bottom 6 rows of specific columns
tail(ozone[, c(6:7, 10)])
#take a look at the State.Code variable to see which state codes appear in the data
table(ozone$State.Code)
library(dplyr)
#use filter() to subset rows by a condition
filter(ozone, State.Code == "78")
#count the number of unique states
select(ozone, State.Name) %>% unique %>% nrow
#unique elements
unique(ozone$State.Name)
#statistical summary for a specific variable
summary(ozone$Arithmetic.Mean)
quantile(ozone$Arithmetic.Mean)
```
R for Data Science
=====================================
Column {.tabset}
-------------------------------------
### Ch5
Data Transformation
```{r echo = TRUE}
library(nycflights13)
library(tidyverse)
flights
##dplyr
#Pick observations by their values (filter()).
#Reorder the rows (arrange()).
#Pick variables by their names (select()).
#Create new variables with functions of existing variables (mutate()).
#Collapse many values down to a single summary (summarise()).
##filter
#filter() allows you to subset observations based on their values. The first argument is the name of the data frame. The second and subsequent arguments are the expressions that filter the data frame.
jan1 <- filter(flights, month == 1, day == 1)
(dec25 <- filter(flights, month == 12, day == 25))
#wrapping the assignment in parentheses both assigns and prints the result
near(sqrt(2) ^ 2, 2)
near(1 / 49 * 49, 1)
#Computers use finite precision arithmetic, so remember that every number you see is an approximation. Instead of relying on ==, use near()
#Finds all flights that departed in November or December
filter(flights, month == 11 | month == 12)
#"x %in% y" will select every row where x is one of the values in y
nov_dec <- filter(flights, month %in% c(11, 12))
filter(flights, !(arr_delay > 120 | dep_delay > 120))
filter(flights, arr_delay <= 120, dep_delay <= 120)
#De Morgan’s law: !(x & y) is the same as !x | !y, and !(x | y) is the same as !x & !y
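# a quick sanity check of De Morgan's law on logical scalars
# (p and q are just illustrative values)
p <- TRUE; q <- FALSE
!(p & q) == (!p | !q)
!(p | q) == (!p & !q)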
# Missing Value
NA > 5
NA + 10
NA / 2
#to determine if a value is missing, use "is.na()"
x<- NA
is.na(x)
df <- tibble(x = c(1, NA, 3))
filter(df, x > 1)
#filter() only includes rows where the condition is TRUE; it excludes both FALSE and NA values. If you want to preserve missing values, ask for them explicitly
filter(df, is.na(x) | x > 1)
##Arrange
#arrange() works similarly to filter() except that instead of selecting rows, it changes their order
arrange(flights, year, month, day)
#If you provide more than one column name, each additional column is used to break ties in the values of the preceding columns
#desc() to re-order by a column in descending order
arrange(flights, desc(dep_delay))
#Missing values are always sorted at the end
df <- tibble(x = c(5, 2, NA))
arrange(df, x)
arrange(df, desc(x))
## select
# allows you to rapidly zoom in on a useful subset using operations based on the names of the variables
# Select columns by name
select(flights, year, month, day)
# Select all columns between year and day (inclusive)
select(flights, year:day)
# Select all columns except those from year to day (inclusive)
select(flights, -(year:day))
#starts_with("abc"): matches names that begin with “abc”.
#ends_with("xyz"): matches names that end with “xyz”.
#contains("ijk"): matches names that contain “ijk”.
#matches("(.)\\1"): selects variables that match a regular expression. This one matches any variables that contain repeated characters. You’ll learn more about regular expressions in strings.
#num_range("x", 1:3): matches x1, x2 and x3.
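# a small illustration of the helpers above on the flights columns
select(flights, starts_with("dep"))
select(flights, ends_with("time"))
select(flights, contains("arr"))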
# rename() is a variant of select() that keeps all the variables
rename(flights, tail_num = tailnum)
# use everything() to move a handful of variables to the start of the data frame
select(flights, time_hour, air_time, everything())
## Add new variables with mutate()
flights_sml <- select(flights,
year:day,
ends_with("delay"),
distance,
air_time
)
mutate(flights_sml,
gain = dep_delay - arr_delay,
hours = air_time / 60,
gain_per_hour = gain / hours
)
#transmute() only keeps the new variables
transmute(flights,
gain = dep_delay - arr_delay,
hours = air_time / 60,
gain_per_hour = gain / hours
)
#Arithmetic operators: +, -, *, /, ^. These are all vectorised, using the so called “recycling rules”. If one parameter is shorter than the other, it will be automatically extended to be the same length. This is most useful when one of the arguments is a single number: air_time / 60, hours * 60 + minute, etc.
#Arithmetic operators are also useful in conjunction with the aggregate functions you’ll learn about later. For example, x / sum(x) calculates the proportion of a total, and y - mean(y) computes the difference from the mean.
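# quick illustration of the aggregate-style arithmetic mentioned above
# (v is just a toy vector)
v <- c(10, 20, 70)
v / sum(v)   # each value as a proportion of the total
v - mean(v)  # each value's difference from the mean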
#Modular arithmetic: %/% (integer division) and %% (remainder), where x == y * (x %/% y) + (x %% y). Modular arithmetic is a handy tool because it allows you to break integers up into pieces. For example, in the flights dataset, you can compute hour and minute from dep_time with:
transmute(flights,
dep_time,
hour = dep_time %/% 100,
minute = dep_time %% 100
)
#Offsets: lead() and lag() allow you to refer to leading or lagging values. This allows you to compute running differences (e.g. x - lag(x)) or find when values change (x != lag(x)). They are most useful in conjunction with group_by(), which you’ll learn about shortly.
(x <- 1:10)
lag(x)
lead(x)
x
cumsum(x)
cummean(x)
#Cumulative and rolling aggregates: R provides functions for running sums, products, mins and maxes: cumsum(), cumprod(), cummin(), cummax(); and dplyr provides cummean() for cumulative means
#ranking
y <- c(1, 2, 2, NA, 3, 4)
min_rank(y)
min_rank(desc(y))
row_number(y)
dense_rank(y)
percent_rank(y)
cume_dist(y)
##Summarise
#summarise() collapses a data frame to a single row; na.rm = TRUE drops missing values before aggregating
summarise(flights, delay = mean(dep_delay, na.rm = TRUE))
#with group_by(), the same call returns one summary row per group
by_day <- group_by(flights, year, month, day)
summarise(by_day, delay = mean(dep_delay, na.rm = TRUE))
## %>% Pipe to combine multiple operations
by_dest <- group_by(flights, dest)
#Group flights by destination.
delay <- summarise(by_dest,
count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)
)
#Summarise to compute distance, average delay, and number of flights.
delay <- filter(delay, count > 20, dest != "HNL")
#Filter to remove noisy points and Honolulu airport, which is almost twice as far away as the next closest airport
delays <- flights %>%
group_by(dest) %>%
summarise(
count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)
) %>%
filter(count > 20, dest != "HNL")
# delays increase with distance up to ~750 miles and then decrease. Maybe as flights get longer there's more ability to make up delays in the air
#ggplot(data = delay, mapping = aes(x = dist, y = delay)) +
# geom_point(aes(size = count), alpha = 1/3) +
# geom_smooth(se = FALSE)
#remove the cancelled flights (those with NA delays)
not_cancelled <- flights %>%
filter(!is.na(dep_delay), !is.na(arr_delay))
not_cancelled %>%
group_by(year, month, day) %>%
summarise(mean = mean(dep_delay))
#Whenever you do any aggregation, it’s always a good idea to include either a count (n()), or a count of non-missing values (sum(!is.na(x))). That way you can check that you’re not drawing conclusions based on very small amounts of data
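# e.g. pair each mean with a count of the non-missing values it is based on
flights %>%
  group_by(dest) %>%
  summarise(
    delay = mean(arr_delay, na.rm = TRUE),
    n_nonmissing = sum(!is.na(arr_delay))
  )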
# The planes (identified by their tail number) that have the highest average delays
delays <- not_cancelled %>%
group_by(tailnum) %>%
summarise(
delay = mean(arr_delay)
)
#ggplot(data = delays, mapping = aes(x = delay)) +
# geom_freqpoly(binwidth = 10)
delays <- not_cancelled %>%
group_by(tailnum) %>%
summarise(
delay = mean(arr_delay, na.rm = TRUE),
n = n()
)
#ggplot(data = delays, mapping = aes(x = n, y = delay)) +
# geom_point(alpha = 1/10)
delays %>%
filter(n > 25) %>%
ggplot(mapping = aes(x = n, y = delay)) +
geom_point(alpha = 1/10)
# Convert to a tibble so it prints nicely
batting <- as_tibble(Lahman::Batting)
batters <- batting %>%
group_by(playerID) %>%
summarise(
ba = sum(H, na.rm = TRUE) / sum(AB, na.rm = TRUE),
ab = sum(AB, na.rm = TRUE)
)
batters %>%
filter(ab > 100) %>%
ggplot(mapping = aes(x = ab, y = ba)) +
geom_point() +
geom_smooth(se = FALSE)
batters %>%
arrange(desc(ba))
not_cancelled %>%
group_by(year, month, day) %>%
summarise(
avg_delay1 = mean(arr_delay),
avg_delay2 = mean(arr_delay[arr_delay > 0]) # the average positive delay
)
# Why is distance to some destinations more variable than to others?
not_cancelled %>%
group_by(dest) %>%
summarise(distance_sd = sd(distance)) %>%
arrange(desc(distance_sd))
# When do the first and last flights leave each day?
not_cancelled %>%
group_by(year, month, day) %>%
summarise(
first = min(dep_time),
last = max(dep_time)
)
not_cancelled %>%
group_by(year, month, day) %>%
summarise(
first_dep = first(dep_time),
last_dep = last(dep_time)
)
not_cancelled %>%
group_by(year, month, day) %>%
mutate(r = min_rank(desc(dep_time))) %>%
filter(r %in% range(r))
# Which destinations have the most carriers?
not_cancelled %>%
group_by(dest) %>%
summarise(carriers = n_distinct(carrier)) %>%
arrange(desc(carriers))
not_cancelled %>%
count(dest)
# “count” (sum) the total number of miles a plane flew
not_cancelled %>%
count(tailnum, wt = distance)
# How many flights left before 5am? (these usually indicate delayed flights from the previous day)
not_cancelled %>%
group_by(year, month, day) %>%
summarise(n_early = sum(dep_time < 500))
# What proportion of flights are delayed by more than an hour?
not_cancelled %>%
group_by(year, month, day) %>%
summarise(hour_perc = mean(arr_delay > 60))
daily <- group_by(flights, year, month, day)
(per_day <- summarise(daily, flights = n()))
(per_month <- summarise(per_day, flights = sum(flights)))
(per_year <- summarise(per_month, flights = sum(flights)))
##ungroup
daily %>%
ungroup() %>% # no longer grouped by date
summarise(flights = n()) # all flights
#Find the worst members of each group
flights_sml %>%
group_by(year, month, day) %>%
filter(rank(desc(arr_delay)) < 10)
#Find all groups bigger than a threshold
popular_dests <- flights %>%
group_by(dest) %>%
filter(n() > 365)
popular_dests
#Standardise to compute per group metrics
popular_dests %>%
filter(arr_delay > 0) %>%
mutate(prop_delay = arr_delay / sum(arr_delay)) %>%
select(year:day, dest, arr_delay, prop_delay)
```
### Ch12
Tidy data
```{r echo = TRUE}
library(tidyverse)
#There are three interrelated rules which make a dataset tidy:
#1.Each variable must have its own column.
#2.Each observation must have its own row.
#3.Each value must have its own cell.
table1 #tidy table
table2
table3
table4a
table4b
# Compute rate per 10,000
table1 %>%
mutate(rate = cases / population * 10000)
# Compute cases per year
table1 %>%
count(year, wt = cases)
# Visualise changes over time
#library(ggplot2)
#ggplot(table1, aes(year, cases)) +
# geom_line(aes(group = country), colour = "grey50") +
# geom_point(aes(colour = country))
##Gather
#some of the column names are not names of variables, but values of a variable. Take table4a: the column names 1999 and 2000 represent values of the year variable, and each row represents two observations, not one
table4a
#The name of the variable whose values form the column names. I call that the key, and here it is year.
#The name of the variable whose values are spread over the cells. I call that value, and here it’s the number of cases.
table4a %>%
gather(`1999`, `2000`, key = "year", value = "cases")
#the variable stored in the cell values:
table4b %>%
gather(`1999`, `2000`, key = "year", value = "population")
#To combine the tidied versions of table4a and table4b into a single tibble, we need to use dplyr::left_join()
tidy4a <- table4a %>%
gather(`1999`, `2000`, key = "year", value = "cases")
tidy4b <- table4b %>%
gather(`1999`, `2000`, key = "year", value = "population")
left_join(tidy4a, tidy4b)
##Spreading
#Spreading is the opposite of gathering. You use it when an observation is scattered across multiple rows
table2
#The column that contains variable names, the key column. Here, it’s type.
#The column that contains values from multiple variables, the value column. Here it’s count.
table2 %>%
spread(key = type, value = count)
#gather() makes wide tables narrower and longer; spread() makes long tables shorter and wider.
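# a quick round trip: spread() then gather() recovers the shape of table2
# (column order aside), showing the two verbs are inverses
table2 %>%
  spread(key = type, value = count) %>%
  gather(cases, population, key = "type", value = "count")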
##Separate
#pulls apart one column into multiple columns, by splitting wherever a separator character appears
table3
#The rate column contains both cases and population variables, and we need to split it into two variables
table3 %>%
separate(rate, into = c("cases", "population"))
#separate() will split values wherever it sees a non-alphanumeric character
table3 %>%
separate(rate, into = c("cases", "population"), sep = "/")
#If you wish to use a specific character to separate a column, you can pass the character to the sep argument of separate().
table3 %>%
separate(rate, into = c("cases", "population"), convert = TRUE)
#ask separate() to try and convert to better types using convert = TRUE
#pass a vector of integers to sep. separate() will interpret the integers as positions to split at. Positive values start at 1 on the far-left of the strings; negative value start at -1 on the far-right of the strings. When using integers to separate strings, the length of sep should be one less than the number of names in "into"
#Separate the last two digits of each year
table3 %>%
separate(year, into = c("century", "year"), sep = 2)
#unite() is the inverse of separate(): it combines multiple columns into a single column
#use unite() to rejoin the century and year columns
tidyr::table5
table5 %>%
unite(new, century, year)
#use the sep argument
table5 %>%
unite(new, century, year, sep = "")
##Missing values
#Explicitly, i.e. flagged with NA.
#Implicitly, i.e. simply not present in the data.
stocks <- tibble(
year = c(2015, 2015, 2015, 2015, 2016, 2016, 2016),
qtr = c( 1, 2, 3, 4, 2, 3, 4),
return = c(1.88, 0.59, 0.35, NA, 0.92, 0.17, 2.66)
)
#make the implicit missing value explicit by putting years in the columns
stocks %>%
spread(year, return)
#set na.rm = TRUE in gather() to turn explicit missing values implicit
stocks %>%
spread(year, return) %>%
gather(year, return, `2015`:`2016`, na.rm = TRUE)
stocks %>%
complete(year, qtr)
#complete() takes a set of columns, and finds all unique combinations. It then ensures the original dataset contains all those values, filling in explicit NAs where necessary
#when a data source has primarily been used for data entry, missing values indicate that the previous value should be carried forward
treatment <- tribble(
~ person, ~ treatment, ~response,
"Derrick Whitmore", 1, 7,
NA, 2, 10,
NA, 3, 9,
"Katherine Burke", 1, 4
)
#fill() takes a set of columns where you want missing values to be replaced by the most recent non-missing value
treatment %>%
fill(person)
```
### Ch20
Vectors
```{r echo = TRUE}
library(tidyverse)
#2 properties of vectors: type and length
#Type
typeof(letters)
typeof(1:10)
#length
x <- list("a", "b", 1:10)
length(x)
#Logical vectors are the simplest type of atomic vector because they can take only three possible values: FALSE, TRUE, and NA
#comparisons
1:10 %% 3 == 0
#create by hand
c(TRUE, TRUE, FALSE, NA)
#Integer and double vectors are known collectively as numeric vectors
#To make an integer, place an L after the number
typeof(1)
typeof(1L)
1.5L
#Doubles are approximations.
x <- sqrt(2) ^ 2
x
x - 2
# when working with floating point numbers: most calculations include some approximation error. Instead of comparing floating point numbers using ==, you should use dplyr::near() which allows for some numerical tolerance
#Integers have one special value: NA, while doubles have four: NA, NaN, Inf and -Inf. All three special values NaN, Inf and -Inf can arise during division
c(-1, 0, 1) / 0
#Avoid using == to check for these other special values. Instead use the helper functions is.finite(), is.infinite(), and is.nan()
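# a quick look at how those helpers classify the special values
is.finite(c(-Inf, NA, NaN, 0))
is.infinite(c(-Inf, NA, NaN, 0))
is.nan(c(-Inf, NA, NaN, 0))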
#Character
#each unique string is only stored in memory once, and every use of the string points to that representation. This reduces the amount of memory needed by duplicated strings. You can see this behaviour in practice with pryr::object_size()
x <- "This is a reasonably long string."
pryr::object_size(x)
y <- rep(x, 1000)
pryr::object_size(y)
#y doesn’t take up 1,000x as much memory as x, because each element of y is just a pointer to the same string; a pointer is 8 bytes, so 1,000 pointers plus the one stored copy of the string come to roughly 8 kB
#Missing Value
#Each type of atomic vector has its own missing value
NA # logical
NA_integer_ # integer
NA_real_ # double
NA_character_ # character
#Coercion
#There are two ways to convert, or coerce, one type of vector to another:
#1 Explicit coercion
# as.logical(), as.integer(), as.double(), or as.character()
#2 Implicit coercion
#EX: using a logical vector in a numeric context
#TRUE=1, FALSE=0
x <- sample(20, 100, replace = TRUE)
y <- x > 10
sum(y) # how many are greater than 10
mean(y) # proportion are greater than 10
#when combining multiple types with c(), the most complex type always wins.
typeof(c(TRUE, 1L))
typeof(c(1L, 1.5))
typeof(c(1.5, "a"))
#Each predicate also comes with a “scalar” version, like is_scalar_atomic(), which checks that the length is 1. This is useful, for example, if you want to check that an argument to your function is a single logical value.
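# a minimal sketch of that idea: validate a single-flag argument with a
# scalar predicate (assumes rlang is available; check_verbose() is just an
# illustrative name, not a function from any package)
check_verbose <- function(verbose) {
  stopifnot(rlang::is_scalar_logical(verbose))
  verbose
}
check_verbose(TRUE)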
#Vector recycling: R implicitly coerces the length of vectors; the shorter vector is repeated, or recycled, to the length of the longer one.
#Most R functions are vectorised, meaning that they operate on whole vectors of numbers
sample(10) + 100
runif(10) > 0.5
#R will expand the shorter vector to the same length as the longer, so-called recycling
1:10 + 1:2
1:10 + 1:3
#While vector recycling can be used to create very succinct, clever code, it can also silently conceal problems. For this reason, the vectorised functions in tidyverse will throw errors when you recycle anything other than a scalar. If you do want to recycle, you’ll need to do it yourself with rep():
tibble(x = 1:4, y = rep(1:2, 2))
tibble(x = 1:4, y = rep(1:2, each = 2))
# Name a vector during creation with c():
c(x = 1, y = 2, z = 4)
#after the fact with purrr::set_names()
set_names(1:3, c("a", "b", "c"))
##Subset
#1.A numeric vector containing only integers. The integers must either be all positive, all negative, or zero. Subsetting with positive integers keeps the elements at those positions:
x <- c("one", "two", "three", "four", "five")
x[c(3, 2, 5)]
x[c(1, 1, 5, 5, 5, 2)]
#Negative values drop the elements at the specified positions
x[c(-1, -3, -5)]
#2.Subsetting with a logical vector keeps all values corresponding to a TRUE value. This is most often useful in conjunction with the comparison functions
x <- c(10, 3, NA, 5, 8, 1, NA)
# All non-missing values of x
x[!is.na(x)]
# All even (or missing!) values of x
x[x %% 2 == 0]
#3.If you have a named vector, you can subset it with a character vector
x <- c(abc = 1, def = 2, xyz = 5)
x[c("xyz", "def")]
##20.5 Recursive vectors (lists)
#create a list
x <- list(1, 2, 3)
x
str(x)
x_named <- list(a = 1, b = 2, c = 3)
str(x_named)
#a list can contain a mix of objects
y <- list("a", 1L, 1.5, TRUE)
str(y)
#or even contain other lists
z <- list(list(1, 2), list(3, 4))
str(z)
x1 <- list(c(1, 2), c(3, 4))
x2 <- list(list(1, 2), list(3, 4))
x3 <- list(1, list(2, list(3)))
a <- list(a = 1:3, b = "a string", c = pi, d = list(-1, -5))
str(a[1:2])
str(a[4]) #[ extracts a sub-list. The result will always be a list
#[[ extracts a single component from a list. It removes a level of hierarchy from the list.
str(a[[1]])
str(a[[4]])
# $ extracts like [[, but you don't need quotes
a$a
a[["a"]]
##Attributes
x <- 1:10
attr(x, "greeting")
attr(x, "greeting") <- "Hi!"
attr(x, "farewell") <- "Bye!"
attributes(x)
##Factors
# Factors are designed to represent categorical data that can take a fixed set of possible values. Factors are built on top of integers, and have a levels attribute
x <- factor(c("ab", "cd", "ab"), levels = c("ab", "cd", "ef"))
typeof(x)
attributes(x)
#Dates in R are numeric vectors that represent the number of days since 1 January 1970
x <- as.Date("1971-01-01")
unclass(x)
typeof(x)
attributes(x)
#Date-times are numeric vectors with class POSIXct that represent the number of seconds since 1 January 1970. (In case you were wondering, “POSIXct” stands for “Portable Operating System Interface”, calendar time.)
x <- lubridate::ymd_hm("1970-01-01 01:00")
unclass(x)
typeof(x)
attributes(x)
#The tzone attribute is optional. It controls how the time is printed, not what absolute time it refers to
attr(x, "tzone") <- "US/Pacific"
x
attr(x, "tzone") <- "US/Eastern"
x
# Another type of date-times called POSIXlt. These are built on top of named lists
y <- as.POSIXlt(x)
typeof(y)
attributes(y)
#POSIXct’s are always easier to work with, so if you find you have a POSIXlt, convert it to a regular date-time with lubridate::as_datetime()
#Tibbles are augmented lists: they have class “tbl_df” + “tbl” + “data.frame”, and names (column) and row.names attributes
tb <- tibble::tibble(x = 1:5, y = 5:1)
typeof(tb)
attributes(tb)
#The difference between a tibble and a list is that all the elements of a data frame must be vectors with the same length. All functions that work with tibbles enforce this constraint.
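# the same-length constraint in action: a mismatched column is an error
# (wrapped in try() so the chunk still knits)
try(tibble::tibble(x = 1:4, y = 1:3))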
#Traditional data.frames have a very similar structure
df <- data.frame(x = 1:5, y = 5:1)
typeof(df)
attributes(df)
#The class of tibble includes “data.frame” which means tibbles inherit the regular data frame behaviour by default
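# inheritance in action: base data.frame operations work on the tibble tb
nrow(tb)
names(tb)
tb$x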
```