MKT500V Introduction to R Day 2: Advanced

Sally Chen

8/26/2018

Slides

http://rpubs.com/sallychen/413763

What we have learned

What we have learned

What we have learned

Today: Agenda

What do we do in data analysis?

R Project: Powerful support for projects management

First step: Create a R project

getwd()   # get working directory
## [1] "/Users/sallychen/Dropbox/Introduction to R/intror 2018"

Rstudio Panes

Create some objects

my_vector = c(1:10)
my_name = c("Sally","Chen")
my_matrix = matrix(my_vector,nrow = 2, ncol =5)
my_list = list(my_vector,my_name,my_matrix)
class(my_list[1]) 
## [1] "list"
class(my_list[[1]])
## [1] "integer"
my_list[1] # the member object
## [[1]]
##  [1]  1  2  3  4  5  6  7  8  9 10
my_list[[1]] # a new list
##  [1]  1  2  3  4  5  6  7  8  9 10
ls()
rm(my_vector)  # careful, remove objects will not be recovered
ls()

Read in Data

https://www.dropbox.com/s/v0iumti48mmba9w/movie_top5000.xlsx?dl=0 (Excel)

https://www.dropbox.com/s/3lecinuiphjpn25/movie_top5000.RData?dl=0 (RData)

movie_top5000 <- read.csv("movie_top5000.csv") # Mac

Console: Temporary Computing

is.data.frame(movie_top5000)  # check whether movie_top5000 is a data.frame
## [1] TRUE
head(movie_top5000)  # look at the first rows of a data.frame
## # A tibble: 6 x 10
##    Rank      imdb                                             Name  Year
##   <int>     <chr>                                            <chr> <int>
## 1     1 tt0499549                                    Avatar (2009)  2009
## 2     2 tt0120338                                   Titanic (1997)  1997
## 3     3 tt0848228                              The Avengers (2012)  2012
## 4     4 tt0468569                           The Dark Knight (2008)  2008
## 5     5 tt0120915 Star Wars: Episode I - The Phantom Menace (1999)  1999
## 6     6 tt1345836                     The Dark Knight Rises (2012)  2012
## # ... with 6 more variables: Rating <dbl>, Length <int>, Genres <chr>,
## #   MPAA <chr>, Release <chr>, Gross_Box <dbl>
anyNA(movie_top5000$Rating) # check whether there are NA in Rating
## [1] FALSE
is.numeric(movie_top5000$Rank) #check whether Rank is numeric
## [1] TRUE
dim(my_matrix) # check the dimensions
## [1] 2 5
my_matrix * my_matrix  # element-wise multiplication
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    9   25   49   81
## [2,]    4   16   36   64  100
my_matrix %*% t(my_matrix) # algebra multiplication
##      [,1] [,2]
## [1,]  165  190
## [2,]  190  220
t(my_matrix) %*% my_matrix # algebra multiplication
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    5   11   17   23   29
## [2,]   11   25   39   53   67
## [3,]   17   39   61   83  105
## [4,]   23   53   83  113  143
## [5,]   29   67  105  143  181

Console: Temporary Computing

colnames(movie_top5000);#see what columns a dataset has
##  [1] "Rank"      "imdb"      "Name"      "Year"      "Rating"   
##  [6] "Length"    "Genres"    "MPAA"      "Release"   "Gross_Box"
summary(movie_top5000$Rating);# see summary statistics if the Rating column
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.700   5.800   6.500   6.378   7.100   9.000
hist(movie_top5000$Rating) # histrogram of rating 

plot(movie_top5000$Rating,movie_top5000$Gross_Box) # point plot the relationship between rating & box office

cor(movie_top5000$Gross_Box,movie_top5000$Rating)  # see the correlation between rating and box office
## [1] 0.1315947

Console: Temporary Computing

help("seq")
help("rnorm")

Console: Summary

Source: Ready to Save and Produce

R script

Writting R script to save User-generated functions

Constructing a function: Basic

myfunction <- function(arg1, arg2, … ){ statements(arg1,arg2…) return(object) }

myfunction(arg1=value1,arg2=value2)
square_sum = function(a, b){  #attributes

sum = a^2+b^2;  #statement a^2=a*a

return(sum)
}; # output
square_sum(a=1,b=2)
## [1] 5

Constructing a function: Default value

square_sum()
square_sum = function(a=0, b=0){  #attributes

sum = a^2+b^2  #statement

return(sum)
}; # output
square_sum() # default values of both a and b are used
## [1] 0
square_sum(a=3,b=2) # specified a and b, no defaults used
## [1] 13
square_sum(a=3)  # specified a, default of b used
## [1] 9

Calling a function: Arguments Matching

square_diff = function(a=0, b=0){

diff = a^2-b^2

return(diff)}
square_diff() # no values parsed, defaults are used
## [1] 0
square_diff(a=1,b=2) # both a and b parsed
## [1] -3
square_diff(a=2,b=1) # both a and b parsed 
## [1] 3
square_diff(1,2)  # value parsed without argument name, matched by position
## [1] -3
square_diff(2,1) # value parsed without argument name, matched by position
## [1] 3

In class Exercise 1

c_to_f<-function(C){
  F = C*1.8 +32
  return(F)
}
c_to_f(26)
## [1] 78.8

Writting R script to save analysis

movie_year<-table(movie_top5000$Year)  # count how many movies in each year
print(movie_year) # you need the print() function if you want to see the output when sourcing the script
movie_top5000_new<-movie_top5000[,c("Rank","Year","Rating","Length","Gross_Box")] # create a seperate movie dataset containning numeric values
movie_correlation <- cor(movie_top5000_new,use="complete.obs") # correlation analysis
print(movie_correlation)  # you need the print() function if you want to see the output when sourcing the script

Environment: Objects archive

History: Commands archive

Files: Archive for imports and exports

Save Your Objects

save(movie_top5000_new,movie_correlation,file="movie_analysis.RData")
rm(movie_top5000_new,movie_correlation) # let's first remove the two stored objects from the environment
head(movie_top5000_new)  #object is removed, R returned error message
load("movie_analysis.RData") # load the stored data back
head(movie_top5000_new)
## # A tibble: 6 x 5
##    Rank  Year Rating Length Gross_Box
##   <int> <int>  <dbl>  <int>     <dbl>
## 1     1  2009    7.9    162  7.61e+08
## 2     2  1997    7.7    194  6.59e+08
## 3     3  2012    8.2    143  6.23e+08
## 4     4  2008    9.0    152  5.33e+08
## 5     5  1999    6.6    136  4.75e+08
## 6     6  2012    8.5    165  4.48e+08

Save Your Workspace

Reopen your Rproject

A Quick Summary

In Class Exercise 2

nrow(flights)
table(flights$month)
max(flights$dep_delay,na.rm=TRUE)
max(flights$air_time,na.rm =TRUE)
flights_sample<-flights[1:1000,]
save(flights_sample,file="flights_sample.RData")
q(save="yes")

Welcome Back!

R packages

R packages

mypackages = c("dplyr","lubridate","ggplot2")
install.packages(mypackages)
library(dplyr)
library(ggplot2)
library(lubridate)

Data manipulation with dplyr

head(flights) # use head() to see the top rows of the data
## # A tibble: 6 x 22
##       X1  year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int> <int>    <int>          <int>     <int>    <int>
## 1  89674  2013    12     7     2040           2123       -43       40
## 2 113634  2013     2     3     2022           2055       -33     2240
## 3  64502  2013    11    10     1408           1440       -32     1549
## 4   9620  2013     1    11     1900           1930       -30     2233
## 5  24916  2013     1    29     1703           1730       -27     1947
## 6 287743  2013     8     9      729            755       -26     1002
## # ... with 14 more variables: sched_arr_time <int>, arr_delay <int>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <int>, distance <int>, hour <int>, minute <int>,
## #   time_hour <dttm>, speed <dbl>, weekday <chr>
dim(flights) # use dim() to see the dimensions of the data
## [1] 336776     22

Data Manipulation: Filter

filter(flights, month == 1, day == 1)
## # A tibble: 842 x 22
##       X1  year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int> <int>    <int>          <int>     <int>    <int>
##  1   210  2013     1     1      940            955       -15     1226
##  2   770  2013     1     1     2030           2045       -15     2150
##  3   593  2013     1     1     1716           1730       -14     1947
##  4   212  2013     1     1      946            959       -13     1146
##  5   820  2013     1     1     2217           2229       -12      249
##  6   148  2013     1     1      839            850       -11     1027
##  7   682  2013     1     1     1849           1900       -11     2131
##  8   107  2013     1     1      800            810       -10      949
##  9   114  2013     1     1      805            815       -10     1006
## 10   127  2013     1     1      820            830       -10      940
## # ... with 832 more rows, and 14 more variables: sched_arr_time <int>,
## #   arr_delay <int>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <int>, distance <int>, hour <int>,
## #   minute <int>, time_hour <dttm>, speed <dbl>, weekday <chr>
a = 3
b = 2
a == b # if a==b ?
## [1] FALSE
a = b   # assign the value fo b to a
print(a)
## [1] 2
a == b   # if a == b ?
## [1] TRUE

Data Manipulation: Order

arrange(flights,dep_delay)  # ascending order by default, see the result
## # A tibble: 336,776 x 22
##        X1  year month   day dep_time sched_dep_time dep_delay arr_time
##     <int> <int> <int> <int>    <int>          <int>     <int>    <int>
##  1  89674  2013    12     7     2040           2123       -43       40
##  2 113634  2013     2     3     2022           2055       -33     2240
##  3  64502  2013    11    10     1408           1440       -32     1549
##  4   9620  2013     1    11     1900           1930       -30     2233
##  5  24916  2013     1    29     1703           1730       -27     1947
##  6 287743  2013     8     9      729            755       -26     1002
##  7  48336  2013    10    23     1907           1932       -25     2143
##  8 164136  2013     3    30     2030           2055       -25     2213
##  9 137608  2013     3     2     1431           1455       -24     1601
## 10 197293  2013     5     5      934            958       -24     1225
## # ... with 336,766 more rows, and 14 more variables: sched_arr_time <int>,
## #   arr_delay <int>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <int>, distance <int>, hour <int>,
## #   minute <int>, time_hour <dttm>, speed <dbl>, weekday <chr>
flights <- arrange(flights,dep_delay)  # overwrite flights with the ordered data.frame
arrange(flights,desc(dep_delay))  #use desc() to specify decending order
## # A tibble: 336,776 x 22
##        X1  year month   day dep_time sched_dep_time dep_delay arr_time
##     <int> <int> <int> <int>    <int>          <int>     <int>    <int>
##  1   7073  2013     1     9      641            900      1301     1242
##  2 235779  2013     6    15     1432           1935      1137     1607
##  3   8240  2013     1    10     1121           1635      1126     1239
##  4 327044  2013     9    20     1139           1845      1014     1457
##  5 270377  2013     7    22      845           1600      1005     1044
##  6 173993  2013     4    10     1100           1900       960     1342
##  7 151975  2013     3    17     2321            810       911      135
##  8 247041  2013     6    27      959           1900       899     1236
##  9 270988  2013     7    22     2257            759       898      121
## 10  87239  2013    12     5      756           1700       896     1058
## # ... with 336,766 more rows, and 14 more variables: sched_arr_time <int>,
## #   arr_delay <int>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <int>, distance <int>, hour <int>,
## #   minute <int>, time_hour <dttm>, speed <dbl>, weekday <chr>
arrange(flights,month,day,desc(dep_delay))  #ordering by multiple attributes
## # A tibble: 336,776 x 22
##       X1  year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int> <int>    <int>          <int>     <int>    <int>
##  1   152  2013     1     1      848           1835       853     1001
##  2   835  2013     1     1     2343           1724       379      314
##  3   650  2013     1     1     1815           1325       290     2120
##  4   816  2013     1     1     2205           1720       285       46
##  5   674  2013     1     1     1842           1422       260     1958
##  6   802  2013     1     1     2115           1700       255     2330
##  7   747  2013     1     1     2006           1630       216     2230
##  8   831  2013     1     1     2312           2000       192       21
##  9   730  2013     1     1     1942           1705       157     2124
## 10   725  2013     1     1     1938           1703       155     2109
## # ... with 336,766 more rows, and 14 more variables: sched_arr_time <int>,
## #   arr_delay <int>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <int>, distance <int>, hour <int>,
## #   minute <int>, time_hour <dttm>, speed <dbl>, weekday <chr>

Data Manipulation: Select

select(flights, year, month, day, carrier)
## # A tibble: 336,776 x 4
##     year month   day carrier
##    <int> <int> <int>   <chr>
##  1  2013    12     7      B6
##  2  2013     2     3      DL
##  3  2013    11    10      EV
##  4  2013     1    11      DL
##  5  2013     1    29      F9
##  6  2013     8     9      MQ
##  7  2013    10    23      EV
##  8  2013     3    30      MQ
##  9  2013     3     2      9E
## 10  2013     5     5      B6
## # ... with 336,766 more rows

Data Manipulation: Adding New column with existing columns

flights<-mutate(flights, speed = distance/air_time*60, 
                total_delay = dep_delay + arr_delay) # mutate keep the original columns 
flights$speed[1:10] # the first 10 rows of speed
##  [1] 368.1509 438.1481 264.2308 435.9712 388.8000 342.2727 341.0811
##  [8] 346.2069 328.3636 430.4000
flights$total_delay[1:10]
##  [1]   5 -91 -42 -40 -37 -19 -25 -62 -54 -68

Data Manipulation: Summarise

summarise(flights,max(speed,na.rm=TRUE),
                  max(arr_delay,na.rm=TRUE),
                  max(distance,na.rm=TRUE),
                  min(distance,na.rm=TRUE),
                  n_distinct(dest))
## # A tibble: 1 x 5
##   `max(speed, na.rm = TRUE)` `max(arr_delay, na.rm = TRUE)`
##                        <dbl>                          <dbl>
## 1                   703.3846                           1272
## # ... with 3 more variables: `max(distance, na.rm = TRUE)` <dbl>,
## #   `min(distance, na.rm = TRUE)` <dbl>, `n_distinct(dest)` <int>

Data Manipulation: The pipe operator %>%

x<-filter(flights,month==1) # create a temporary data x
summarise(x,max(dep_delay,na.rm=TRUE)
          ,min(dep_delay,na.rm=TRUE)) # calculate the max and mean on the temporary data x
## # A tibble: 1 x 2
##   `max(dep_delay, na.rm = TRUE)` `min(dep_delay, na.rm = TRUE)`
##                            <dbl>                          <dbl>
## 1                           1301                            -30
filter(flights,month==1) %>%
summarise(max(dep_delay,na.rm=TRUE),
          min(dep_delay,na.rm=TRUE))
## # A tibble: 1 x 2
##   `max(dep_delay, na.rm = TRUE)` `min(dep_delay, na.rm = TRUE)`
##                            <dbl>                          <dbl>
## 1                           1301                            -30

Quick Summary

In Class Exercise 3

flight_1<-select(flights,month,day,origin,dest,air_time)
flight_1<-arrange(flight_1,air_time)
print(flight_1[1:10,])
## # A tibble: 10 x 5
##    month   day origin  dest air_time
##    <int> <int>  <chr> <chr>    <int>
##  1     4    13    EWR   BDL       20
##  2     1    16    EWR   BDL       20
##  3     2     5    EWR   BDL       21
##  4     3     2    LGA   BOS       21
##  5     8    18    EWR   BDL       21
##  6     2    12    EWR   PHL       21
##  7     9     3    EWR   BDL       21
##  8     9     3    EWR   BDL       21
##  9     2     3    EWR   PHL       21
## 10    12     6    EWR   BDL       21
filter(flight_1,month==1,day==1,origin=="LGA") %>% 
summarise(mean(air_time,na.rm=TRUE))
## # A tibble: 1 x 1
##   `mean(air_time, na.rm = TRUE)`
##                            <dbl>
## 1                       144.3305

Data Analysis: Grouping and Aggregating

Data Analysis: Grouping and Aggregating

delay_origin_summary <- flights %>% 
                        group_by(origin)  %>% 
                         summarise(
                         avg_delay = mean(dep_delay,na.rm = TRUE), 
                         flights = n()
                        )
delay_origin_summary 
## # A tibble: 3 x 3
##   origin avg_delay flights
##    <chr>     <dbl>   <int>
## 1    EWR  15.10795  120835
## 2    JFK  12.11216  111279
## 3    LGA  10.34688  104662

Quick Summary: Grouping and Aggregating

In Class Exercise 4

## # A tibble: 224 x 5
## # Groups:   origin [3]
##    origin  dest no_of_flights mean_dep_delay mean_arr_delay
##     <chr> <chr>         <int>          <dbl>          <dbl>
##  1    EWR   TYS           323       41.81847       41.15016
##  2    EWR   CAE           104       36.33684       44.58511
##  3    EWR   TUL           315       34.90635       33.65986
##  4    LGA   SBN             6       31.33333       14.50000
##  5    EWR   OKC           346       30.56881       30.61905
##  6    LGA   BHM           296       29.77860       17.01119
##  7    LGA   CAE            12       29.50000       19.66667
##  8    EWR   DSM           411       29.32824       23.54872
##  9    EWR   JAC            23       28.70000       29.89474
## 10    EWR   ROC           513       27.86122       20.66393
## # ... with 214 more rows
## # A tibble: 16 x 4
##    carrier mean_dep_delay mean_arr_delay total_distance_covered
##      <chr>          <dbl>          <dbl>                  <int>
##  1      F9      20.215543     21.9207048                1109700
##  2      FL      18.726075     20.1159055                2167344
##  3      EV      19.955390     15.7964311               30498951
##  4      YV      18.996330     15.5569853                 225395
##  5      OO      12.586207     11.9310345                  16026
##  6      MQ      10.552041     10.7747334               15033955
##  7      WN      17.711744      9.6491199               12229203
##  8      B6      13.022522      9.4579733               58384137
##  9      9E      16.725769      7.3796692                9788152
## 10      UA      12.106073      3.5580111               89705524
## 11      US       3.782418      2.1295951               11365778
## 12      VX      12.869421      1.7644644               12902327
## 13      DL       9.264505      1.6443409               59507317
## 14      AA       8.586016      0.3642909               43864584
## 15      HA       4.900585     -6.9152047                1704186
## 16      AS       5.804775     -9.9308886                1715028

Date Object Manipulation

now <-"2018-08-26 10:14:45"
class(now)
## [1] "character"
now<-ymd_hms(now)
class(now)
## [1] "POSIXct" "POSIXt"
print(now)
## [1] "2018-08-26 10:14:45 UTC"

Date Object Manipulation

year(now)
## [1] 2018
weekdays(now)
## [1] "Sunday"
flights$weekday<-weekdays(flights$time_hour)
table(flights$weekday) 
## 
##    Friday    Monday  Saturday    Sunday  Thursday   Tuesday Wednesday 
##     50308     50690     38720     46357     50219     50422     50060

Plot with ggplot

Plot with ggplot: Bar plot

ggplot(flights) + geom_bar(aes(x = carrier)) 

# first parse the data flights to ggplot
# then tell geom_bar you want to group on carrier 
ggplot(flights) + geom_bar(aes(x = carrier,fill=origin))  

# "x=carrier" tells ggplot to count the number of flights for each carrier
# "fill = origin" tells ggplot to fill the color by origin
ggplot(flights) + geom_bar(aes(x = carrier,fill=origin)) + labs(title ="Flights by Carriers & Airport in New York City")

# use labs() to change lables to your plot, like x,y,title...

Plot with ggplot: Scatter plot

ggplot(flights) + geom_point(aes(x=distance,y=air_time))

ggplot2 cheat sheat

In Class Exercise 5

ggplot(flights) + geom_bar(aes(x=weekday,fill=origin)) + labs(title="Weekday Flights From Each NYC Airport")

ggplot(flights) + geom_point(aes(x = origin,y=distance))

Write Reports inside R with RNotebook

Write Reports: Steps

In Class Exercise 6

arrange(movie_top5000,desc(Rating))[1:10]
## # A tibble: 5,000 x 10
##     Rank      imdb
##    <int>     <chr>
##  1     4 tt0468569
##  2    19 tt0167260
##  3  3838 tt0137523
##  4    24 tt0167261
##  5    34 tt0120737
##  6    48 tt1375666
##  7    96 tt0816692
##  8  2461 tt0317248
##  9  3378 tt0114814
## 10  4673 tt0114369
## # ... with 4,990 more rows, and 8 more variables: Name <chr>, Year <int>,
## #   Rating <dbl>, Length <int>, Genres <chr>, MPAA <chr>, Release <chr>,
## #   Gross_Box <dbl>
year_movie_summary<-movie_top5000 %>% group_by(Year) %>% summarise(count = n(),avg_gross_box=mean(Gross_Box,na.rm=TRUE))

year_movie_summary
## # A tibble: 20 x 3
##     Year count avg_gross_box
##    <int> <int>         <dbl>
##  1  1995   235      21079366
##  2  1996   244      23688082
##  3  1997   254      25657862
##  4  1998   238      26555021
##  5  1999   244      29911893
##  6  2000   250      29542368
##  7  2001   257      31132759
##  8  2002   250      35851364
##  9  2003   233      37066275
## 10  2004   246      34054301
## 11  2005   269      32286569
## 12  2006   288      31935729
## 13  2007   277      33651675
## 14  2008   259      37165236
## 15  2009   253      41202087
## 16  2010   237      41900143
## 17  2011   241      41107705
## 18  2012   240      43382721
## 19  2013   253      42026126
## 20  2014   232      45489513
ggplot(year_movie_summary)+geom_point(aes(Year,avg_gross_box))

Quick Summary

Thank You!

Best wishes for your journey at Olin!