Introduction to R: Advanced

Sally Chen

8/25/2017

Slides

http://rpubs.com/sallychen/301512

What do we do in data analysis?

R Project: Powerful support for project management

First step: Create an R project

# Show the current working directory (here: the project folder).
getwd()   # get working directory
## [1] "/Users/sallychen/Desktop/Introduction to R/intror"
# Print summary statistics for each column of the built-in `cars` dataset.
summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

RStudio Panes

Read in Data

# readr provides read_csv() for reading CSV files.
library(readr)
# Use a path relative to the project's working directory (see getwd() above)
# so the script does not depend on one user's Desktop layout.
movie_top5000 <- read_csv("movie_top5000.csv")
## Parsed with column specification:
## cols(
##   Rank = col_integer(),
##   imdb = col_character(),
##   Name = col_character(),
##   Year = col_integer(),
##   Rating = col_double(),
##   Length = col_integer(),
##   Genres = col_character(),
##   MPAA = col_character(),
##   Release = col_character(),
##   Gross_Box = col_double()
## )

Console: Temporary Computing

class(movie_top5000)  # when unsure what kind of object you have, check its class
## [1] "tbl_df"     "tbl"        "data.frame"
# The console evaluates each expression and prints its value.
1 + 1
## [1] 2
log10(10)
## [1] 1
exp(2)
## [1] 7.389056
# Supply exactly nrow * ncol values: matrix() only uses the first
# nrow * ncol elements of a longer vector and discards the rest.
a <- 1:20
# Fill a 5 x 4 matrix row by row. Argument names are written out in full
# instead of relying on partial matching (nr / nc).
m <- matrix(a, nrow = 5, ncol = 4, byrow = TRUE)
print(m)
##      [,1] [,2] [,3] [,4]
## [1,]    1    2    3    4
## [2,]    5    6    7    8
## [3,]    9   10   11   12
## [4,]   13   14   15   16
## [5,]   17   18   19   20
# MARGIN = 1 applies the function to every row, giving the row sums.
apply(m, MARGIN = 1, FUN = sum)
## [1] 10 26 42 58 74

Console: Temporary Computing

colnames(movie_top5000) # list the columns the dataset contains
##  [1] "Rank"      "imdb"      "Name"      "Year"      "Rating"   
##  [6] "Length"    "Genres"    "MPAA"      "Release"   "Gross_Box"
summary(movie_top5000) # summary statistics for every column
##       Rank          imdb               Name                Year     
##  Min.   :   1   Length:5000        Length:5000        Min.   :1995  
##  1st Qu.:1251   Class :character   Class :character   1st Qu.:2000  
##  Median :2500   Mode  :character   Mode  :character   Median :2005  
##  Mean   :2500                                         Mean   :2005  
##  3rd Qu.:3750                                         3rd Qu.:2009  
##  Max.   :5000                                         Max.   :2014  
##                                                                     
##      Rating          Length         Genres              MPAA          
##  Min.   :1.700   Min.   : 45.0   Length:5000        Length:5000       
##  1st Qu.:5.800   1st Qu.: 95.0   Class :character   Class :character  
##  Median :6.500   Median :105.0   Mode  :character   Mode  :character  
##  Mean   :6.378   Mean   :108.2                                        
##  3rd Qu.:7.100   3rd Qu.:117.0                                        
##  Max.   :9.000   Max.   :383.0                                        
##                  NA's   :35                                           
##    Release            Gross_Box        
##  Length:5000        Min.   :   189000  
##  Class :character   1st Qu.:  1190000  
##  Mode  :character   Median : 11600000  
##                     Mean   : 34184385  
##                     3rd Qu.: 41300000  
##                     Max.   :761000000  
## 
# Plot the ratings as points
plot(movie_top5000$Rating)

# Plot the gross box office values as points
plot(movie_top5000$Gross_Box)

# Correlation between gross box office and rating
cor(x = movie_top5000$Gross_Box, y = movie_top5000$Rating)
## [1] 0.1315947

Console: Temporary Computing

# Open the help pages for seq() and rnorm()
?seq
?rnorm

Source: When you are ready to save and produce

Source: Ready to Save and Produce

# Add two values.
#   a, b: the values to add
# Returns a + b.
simplesum <- function(a, b) {
  a + b
}

# Subtract one value from another.
#   a: the value to subtract from
#   b: the value to subtract
# Returns a - b.
simpledif <- function(a, b) {
  a - b
}

# A function does not need any arguments.
# Draws one standard-normal random number and prints one of two messages,
# depending on whether the draw is positive.
Sally <- function() {
  draw <- rnorm(1)
  reply <- if (draw > 0) {
    "I'm really tired,see you tomorrow"
  } else {
    "Shall we have a cup of coffee?"
  }
  print(reply)
}
# Try out the functions defined above
simplesum(100, 200)
## [1] 300
# Sally() draws a random number, so its message can differ between calls
Sally()
## [1] "Shall we have a cup of coffee?"
Sally()
## [1] "Shall we have a cup of coffee?"

Source: Ready to Save and Produce

# Count how many movies there are in each year
movie_year <- table(movie_top5000[["Year"]])
# Create a separate movie dataset containing only the numeric columns
movie_top5000_1 <- movie_top5000[
  c("Rank", "Year", "Rating", "Length", "Gross_Box")
]
# Correlation analysis, using only the rows without missing values
movie_correlation <- cor(movie_top5000_1, use = "complete.obs")

Environment: objects archive

History

Files: Archive for imports and exports

Save Your Workspace

Reopen your R project

A Quick Summary

R packages

R packages

# Packages used in the rest of the session.
mypackages <- c("dplyr", "data.table", "lubridate", "nycflights13", "ggplot2")
# Install only the packages that are not available yet, so re-running the
# script does not download and reinstall everything each time.
missing_packages <- mypackages[
  !vapply(mypackages, requireNamespace, logical(1), quietly = TRUE)
]
# install.packages() must not be called with an empty vector, hence the guard.
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}
library(dplyr)
library(ggplot2)
library(lubridate)
library(nycflights13)

Data manipulation with dplyr

# Make a workspace copy of `flights` (nycflights13 is loaded above)
flights <- flights
# head() shows the top rows of the data
head(flights)
## # A tibble: 6 × 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     1      517            515         2      830
## 2  2013     1     1      533            529         4      850
## 3  2013     1     1      542            540         2      923
## 4  2013     1     1      544            545        -1     1004
## 5  2013     1     1      554            600        -6      812
## 6  2013     1     1      554            558        -4      740
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>
dim(flights) # dim() returns the dimensions of the data: rows, then columns
## [1] 336776     19

Data Manipulation: Filter

# Keep only the rows where both conditions hold: month is 1 and day is 1.
filter(flights, month == 1, day == 1)
## # A tibble: 842 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1   2013     1     1      517            515         2      830
## 2   2013     1     1      533            529         4      850
## 3   2013     1     1      542            540         2      923
## 4   2013     1     1      544            545        -1     1004
## 5   2013     1     1      554            600        -6      812
## 6   2013     1     1      554            558        -4      740
## 7   2013     1     1      555            600        -5      913
## 8   2013     1     1      557            600        -3      709
## 9   2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # ... with 832 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Data Manipulation: Order

arrange(flights, dep_delay) # sorts in ascending order by default
## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1   2013    12     7     2040           2123       -43       40
## 2   2013     2     3     2022           2055       -33     2240
## 3   2013    11    10     1408           1440       -32     1549
## 4   2013     1    11     1900           1930       -30     2233
## 5   2013     1    29     1703           1730       -27     1947
## 6   2013     8     9      729            755       -26     1002
## 7   2013    10    23     1907           1932       -25     2143
## 8   2013     3    30     2030           2055       -25     2213
## 9   2013     3     2     1431           1455       -24     1601
## 10  2013     5     5      934            958       -24     1225
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Data Manipulation: Order

arrange(flights, desc(dep_delay)) # wrap a column in desc() for descending order
## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1   2013     1     9      641            900      1301     1242
## 2   2013     6    15     1432           1935      1137     1607
## 3   2013     1    10     1121           1635      1126     1239
## 4   2013     9    20     1139           1845      1014     1457
## 5   2013     7    22      845           1600      1005     1044
## 6   2013     4    10     1100           1900       960     1342
## 7   2013     3    17     2321            810       911      135
## 8   2013     6    27      959           1900       899     1236
## 9   2013     7    22     2257            759       898      121
## 10  2013    12     5      756           1700       896     1058
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Data Manipulation: Order

arrange(flights, month, day, desc(dep_delay)) # order by several columns at once
## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1   2013     1     1      848           1835       853     1001
## 2   2013     1     1     2343           1724       379      314
## 3   2013     1     1     1815           1325       290     2120
## 4   2013     1     1     2205           1720       285       46
## 5   2013     1     1     1842           1422       260     1958
## 6   2013     1     1     2115           1700       255     2330
## 7   2013     1     1     2006           1630       216     2230
## 8   2013     1     1     2312           2000       192       21
## 9   2013     1     1     1942           1705       157     2124
## 10  2013     1     1     1938           1703       155     2109
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Data Manipulation: Select

# Keep only the listed columns.
select(flights, year, month, day, carrier)
## # A tibble: 336,776 × 4
##     year month   day carrier
##    <int> <int> <int>   <chr>
## 1   2013     1     1      UA
## 2   2013     1     1      UA
## 3   2013     1     1      AA
## 4   2013     1     1      B6
## 5   2013     1     1      DL
## 6   2013     1     1      UA
## 7   2013     1     1      B6
## 8   2013     1     1      EV
## 9   2013     1     1      B6
## 10  2013     1     1      AA
## # ... with 336,766 more rows

Data Manipulation: Adding

# mutate() keeps the original columns and adds the new `speed` column
flights <- mutate(flights, speed = distance / air_time * 60)
# Look at the new column on its own
select(flights, speed)
## # A tibble: 336,776 × 1
##       speed
##       <dbl>
## 1  370.0441
## 2  374.2731
## 3  408.3750
## 4  516.7213
## 5  394.1379
## 6  287.6000
## 7  404.4304
## 8  259.2453
## 9  404.5714
## 10 318.6957
## # ... with 336,766 more rows

Quick Summary

Data Analysis: Grouping and Aggregating

Data Analysis: Grouping and Aggregating

# Group the flights by origin
origins <- group_by(flights, origin)
# Compute one row of delay statistics per origin
delay_origin_summary <- summarise(
  origins,
  avg_delay = mean(dep_delay, na.rm = TRUE), # na.rm = TRUE removes NA values
  flights = n(), # n() counts the observations in each group
  sd_delay = sd(dep_delay, na.rm = TRUE),
  max_delay = max(dep_delay, na.rm = TRUE)
)
delay_origin_summary
## # A tibble: 3 × 5
##   origin avg_delay flights sd_delay max_delay
##    <chr>     <dbl>   <int>    <dbl>     <dbl>
## 1    EWR  15.10795  120835 41.32370      1126
## 2    JFK  12.11216  111279 39.03507      1301
## 3    LGA  10.34688  104662 39.99302       911
arrange(delay_origin_summary, desc(avg_delay)) # largest average delay first
## # A tibble: 3 × 5
##   origin avg_delay flights sd_delay max_delay
##    <chr>     <dbl>   <int>    <dbl>     <dbl>
## 1    EWR  15.10795  120835 41.32370      1126
## 2    JFK  12.11216  111279 39.03507      1301
## 3    LGA  10.34688  104662 39.99302       911

Quick Summary: Grouping and Aggregating

Date Object Manipulation

# Start with a timestamp stored as plain text
now <- "2017-08-25 11:04:04"
class(now)
## [1] "character"
# ymd_hms() converts the text into a date-time object
now <- ymd_hms(now)
class(now)
## [1] "POSIXct" "POSIXt"
print(now)
## [1] "2017-08-25 11:04:04 UTC"

Date Object Manipulation

year(now) # the year part of the date-time
## [1] 2017
weekdays(now) # the name of the day of the week
## [1] "Friday"
# Store the weekday of each flight's time_hour value, then count flights per weekday
flights$weekday <- weekdays(flights$time_hour)
table(flights$weekday)
## 
##    Friday    Monday  Saturday    Sunday  Thursday   Tuesday Wednesday 
##     50308     50690     38720     46357     50219     50422     50060

Plot with ggplot

# Base plot object reused by the bar charts below
g <- ggplot(flights)

# Count of flights for each carrier
g + geom_bar(aes(carrier))

# Same counts, with the flights of each carrier grouped by origin
g + geom_bar(aes(carrier, fill = origin))

# The previous chart with a title added
g +
  geom_bar(aes(carrier, fill = origin)) +
  labs(title = "Flights by Carriers & Airport in New York City")

ggplot2 cheat sheet

Write Reports

Write Reports

In-class exercise

# Order the movies by rating, then gross box office, then year (all ascending).
# Each column is listed once: repeating a column later in the call, even
# wrapped in desc(), cannot change the order, because rows that tie on the
# earlier keys already have equal values in those columns. Wrap a column in
# desc() here if descending order is wanted instead.
ordered_movies <- arrange(movie_top5000, Rating, Gross_Box, Year)
# Group the movies by year, then count them and average the gross box office
years_grouped <- group_by(movie_top5000, Year)
year_movie_summary <- summarise(
  years_grouped,
  count = n(),
  avg_gross_box = mean(Gross_Box, na.rm = TRUE)
)
# Line chart of the average gross box office per year
ggplot(year_movie_summary) + geom_line(aes(Year, avg_gross_box))

Quick Summary

Thank You!