Data Manipulation in R (Base R / dplyr)
- selecting or filtering rows
- selecting columns
- creating new columns
- arranging and ordering
- aggregating and summarising
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the state-by-state crime counts from the working directory.
# stringsAsFactors is set explicitly so text columns are read as factors on
# every R version (the default changed to FALSE in R 4.0.0); the per-level
# counts shown in the summary() output below depend on it.
crime.by.state <- read.csv("CrimeStatebyState.csv", stringsAsFactors = TRUE)
# Column names of the loaded data frame.
names(crime.by.state)
## [1] "State" "Type.of.Crime" "Crime" "Year"
## [5] "Count"
# Number of rows and columns.
dim(crime.by.state)
## [1] 16422 5
# Per-column summary: level counts for factors, quantiles for numeric columns.
summary(crime.by.state)
## State Type.of.Crime
## Alabama : 322 Property Crime:7038
## Alaska : 322 Violent Crime :9384
## Arizona : 322
## Arkansas : 322
## California: 322
## Colorado : 322
## (Other) :14490
## Crime Year
## Aggravated assault :2346 Min. :1960
## Burglary :2346 1st Qu.:1971
## Forcible rape :2346 Median :1982
## Larceny-theft :2346 Mean :1982
## Motor vehicle theft :2346 3rd Qu.:1994
## Murder and nonnegligent Manslaughter:2346 Max. :2005
## Robbery :2346
## Count
## Min. : 0
## 1st Qu.: 669
## Median : 4620
## Mean : 29606
## 3rd Qu.: 24114
## Max. :986120
##
# Base R data frame indexing: df[rows, columns]
Using Base R
Filtering rows
# Row filter: build a logical mask that is TRUE for New York rows from 2005
# and use it as the row index; the empty column index keeps every column.
crime.ny.2005 <- crime.by.state[
  with(crime.by.state, State == "New York" & Year == 2005),
]
# Alternative using subset(): the condition is evaluated inside the data
# frame, so columns are referenced by their bare names.
crime.ny.2005 <- subset(crime.by.state,
                        Year == 2005 & State == "New York")
Arranging and ordering
# Reorder the rows so the largest Count comes first: order() returns the row
# positions in descending Count order, which are then used as the row index.
crime.ny.2005 <- crime.ny.2005[
  order(crime.ny.2005[["Count"]], decreasing = TRUE),
]
Selecting columns
# Keep only the two columns needed from here on, selected by name.
crime.ny.2005 <- crime.ny.2005[c("Type.of.Crime", "Count")]
Creating new columns
# Add each row's share of the total Count across all rows in the data frame.
crime.ny.2005$Proportion <- with(crime.ny.2005, Count / sum(Count))
Aggregation and summarization
# Total Count per crime type.
summary1 <- aggregate(
  Count ~ Type.of.Crime, FUN = sum, data = crime.ny.2005
)
# Number of rows per crime type.
summary2 <- aggregate(
  Count ~ Type.of.Crime, FUN = length, data = crime.ny.2005
)
# Join the two summaries on the crime type. Both inputs carry a column named
# Count, so merge() applies its default suffixes: Count.x holds the sums from
# summary1 and Count.y holds the row counts from summary2.
summary.crime.ny.2005 <- merge(
  x = summary1, y = summary2, by = "Type.of.Crime"
)
Using dplyr package
Filtering rows
# Keep only the rows where both conditions hold: New York, year 2005.
crime.ny.2005 <- crime.by.state %>%
  filter(State == "New York" & Year == 2005)
## Warning: package 'bindrcpp' was built under R version 3.4.2
Arranging and ordering
# Sort the rows by Count, largest first.
crime.ny.2005 <- crime.ny.2005 %>%
  arrange(desc(Count))
Selecting columns
# Keep only the crime type and count columns.
crime.ny.2005 <- crime.ny.2005 %>%
  select(Type.of.Crime, Count)
Creating new columns
# Add each row's share of the total Count as a new Proportion column.
crime.ny.2005 <- crime.ny.2005 %>%
  mutate(Proportion = Count / sum(Count))
Aggregation and summarization
# Group the rows by crime type, then reduce each group to one summary row:
# num.types is the number of rows in the group, counts is the group's total.
by.type <- crime.ny.2005 %>%
  group_by(Type.of.Crime)
summary.crime.ny.2005 <- by.type %>%
  summarise(num.types = n(), counts = sum(Count))
Chaining by nesting function calls
# The same steps written as a single nested expression. It evaluates
# inside-out: filter() runs first, then arrange(), and mutate() last.
final <- mutate(
  arrange(
    filter(crime.by.state, State == "New York" & Year == 2005),
    desc(Count)
  ),
  Proportion = Count / sum(Count)
)
Piping, or using the pipe operator (%>%)
# The same steps written with the pipe: each line passes its result on as the
# first argument of the next call, so the code reads top to bottom.
final.piped <- crime.by.state %>%
  filter(State == "New York" & Year == 2005) %>%
  arrange(desc(Count)) %>%
  mutate(Proportion = Count / sum(Count))