R Demo Class

Data Manipulation in R (Base R / dplyr)

selecting or filtering rows
selecting columns
creating new columns
arranging and ordering
aggregating and summarising

library(dplyr)

## Warning: package 'dplyr' was built under R version 3.4.2

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Load the state-level crime counts from a CSV in the working directory.
# stringsAsFactors = TRUE is spelled out because R >= 4.0.0 reads text columns
# as character by default; the factor-level counts printed by summary() below
# require State, Type.of.Crime and Crime to be factors.
crime.by.state <- read.csv("CrimeStatebyState.csv", stringsAsFactors = TRUE)
# List the column names of the loaded data frame.
names(crime.by.state)

## [1] "State"         "Type.of.Crime" "Crime"         "Year"         
## [5] "Count"

# Report the data frame's dimensions as (number of rows, number of columns).
dim(crime.by.state)

## [1] 16422     5

# Print a per-column summary of the data frame.
summary(crime.by.state)

##         State              Type.of.Crime 
##  Alabama   :  322   Property Crime:7038  
##  Alaska    :  322   Violent Crime :9384  
##  Arizona   :  322                        
##  Arkansas  :  322                        
##  California:  322                        
##  Colorado  :  322                        
##  (Other)   :14490                        
##                                   Crime           Year     
##  Aggravated assault                  :2346   Min.   :1960  
##  Burglary                            :2346   1st Qu.:1971  
##  Forcible rape                       :2346   Median :1982  
##  Larceny-theft                       :2346   Mean   :1982  
##  Motor vehicle theft                 :2346   3rd Qu.:1994  
##  Murder and nonnegligent Manslaughter:2346   Max.   :2005  
##  Robbery                             :2346                 
##      Count       
##  Min.   :     0  
##  1st Qu.:   669  
##  Median :  4620  
##  Mean   : 29606  
##  3rd Qu.: 24114  
##  Max.   :986120  
##

# Base R indexing: df[rows, columns]

Using Base R

Filtering rows

# Keep only the New York rows for 2005 with logical row indexing. The empty
# slot after the comma keeps every column. Note: a row whose condition
# evaluates to NA comes back as an all-NA row with this form.
crime.ny.2005 <- crime.by.state[
  crime.by.state$Year == 2005 & crime.by.state$State == "New York",
]
# Alternate using subset(): the condition is evaluated inside the data frame,
# so columns are referenced by bare name, and NA conditions are treated as
# FALSE (those rows are dropped).
crime.ny.2005 <- subset(crime.by.state, Year == 2005 & State == "New York")

Arranging and ordering

# Sort the rows from the largest Count to the smallest: order() returns the
# row permutation, which is then used as the row index.
crime.ny.2005 <- crime.ny.2005[
  order(crime.ny.2005[["Count"]], decreasing = TRUE),
  ,
  drop = FALSE
]

Selecting columns

# Keep only the crime-type and count columns, selected by name.
crime.ny.2005 <- crime.ny.2005[, c("Type.of.Crime", "Count"), drop = FALSE]

Creating new columns

# Add each row's share of the total count as a new Proportion column.
crime.ny.2005[["Proportion"]] <-
  crime.ny.2005[["Count"]] / sum(crime.ny.2005[["Count"]])

Aggregation and summarization

# Total Count per crime type.
summary1 <- aggregate(Count ~ Type.of.Crime, data = crime.ny.2005, FUN = sum)
# Number of rows per crime type.
summary2 <- aggregate(Count ~ Type.of.Crime, data = crime.ny.2005, FUN = length)
# Both aggregates name their value column "Count", so merging them unchanged
# would produce the ambiguous columns Count.x and Count.y. Rename the value
# columns on the way into merge() so the result has the same columns as the
# dplyr summarise() version: Type.of.Crime, num.types, counts.
summary.crime.ny.2005 <- merge(
  setNames(summary2, c("Type.of.Crime", "num.types")),
  setNames(summary1, c("Type.of.Crime", "counts")),
  by = "Type.of.Crime"
)

Using dplyr package

Filtering rows

# Keep only the New York rows for 2005; both conditions must hold.
crime.ny.2005 <- filter(
  crime.by.state,
  State == "New York" & Year == 2005
)

## Warning: package 'bindrcpp' was built under R version 3.4.2

Arranging and ordering

# Sort the rows from the largest Count to the smallest.
crime.ny.2005 <- arrange(
  .data = crime.ny.2005,
  desc(Count)
)

Selecting columns

# Keep only the crime-type and count columns.
crime.ny.2005 <- select(
  .data = crime.ny.2005,
  Type.of.Crime,
  Count
)

Creating new columns

# Add each row's share of the total count as a new Proportion column.
crime.ny.2005 <- mutate(
  .data = crime.ny.2005,
  Proportion = Count / sum(Count)
)

Aggregation and summarization

# Group the rows by crime type, then collapse each group to one row holding
# the number of rows in the group and the sum of their counts.
by.type <- crime.ny.2005 %>% group_by(Type.of.Crime)
summary.crime.ny.2005 <- by.type %>%
  summarise(
    num.types = n(),
    counts = sum(Count)
  )

Chaining Rule

# Nested calls read inside-out: filter() runs first, its result is sorted by
# arrange(), and mutate() then adds the Proportion column.
final <- mutate(
  arrange(
    filter(crime.by.state, State == "New York", Year == 2005),
    desc(Count)
  ),
  Proportion = Count / sum(Count)
)

Piping or using pipe operator

# Pipe version: the result of each step is passed as the first argument of
# the next call, so the steps read top to bottom.
final.piped <- crime.by.state %>%
  filter(State == "New York" & Year == 2005) %>%
  arrange(desc(Count)) %>%
  mutate(Proportion = Count / sum(Count))

R Demo Class

Priyanka Gagneja

November 7, 2018

Data Manipulation in R (Base R / dplyr)

Using Base R

Filtering rows

Arranging and ordering

Selecting columns

Creating new columns

Aggregation and summarization

Using dplyr package

Filtering rows

Arranging and ordering

Selecting columns

Creating new columns

Aggregation and summarization

Chaining Rule

Piping or using pipe operator