
# Load the restaurant data from a local CSV file.
# stringsAsFactors = TRUE keeps the text columns as factors, which is what the
# per-level counts printed by summary(restData) rely on; R >= 4.0.0 would
# otherwise read them as plain character vectors.
restData <- read.csv(
  "./data/restaurants.csv",
  encoding = "UTF-8",
  stringsAsFactors = TRUE
)
Look at a bit of the data
# Show the first three rows of the data frame.
head(restData, n = 3)
## name zipCode neighborhood councilDistrict policeDistrict
## 1 410 21206 Frankford 2 NORTHEASTERN
## 2 1919 21231 Fells Point 1 SOUTHEASTERN
## 3 SAUTE 21224 Canton 1 SOUTHEASTERN
## Location.1
## 1 4509 BELAIR ROAD\nBaltimore, MD\n
## 2 1919 FLEET ST\nBaltimore, MD\n
## 3 2844 HUDSON ST\nBaltimore, MD\n
# Show the last three rows of the data frame.
tail(restData, n = 3)
## name zipCode neighborhood councilDistrict
## 1325 ZINK'S CAF<U+0090> 21213 Belair-Edison 13
## 1326 ZISSIMOS BAR 21211 Hampden 7
## 1327 ZORBAS 21224 Greektown 2
## policeDistrict Location.1
## 1325 NORTHEASTERN 3300 LAWNVIEW AVE\nBaltimore, MD\n
## 1326 NORTHERN 1023 36TH ST\nBaltimore, MD\n
## 1327 SOUTHEASTERN 4710 EASTERN Ave\nBaltimore, MD\n
Make summary
# Per-column summary of the whole data frame.
summary(object = restData)
## name zipCode neighborhood
## MCDONALD'S : 8 Min. :-21226 Downtown :128
## POPEYES FAMOUS FRIED CHICKEN: 7 1st Qu.: 21202 Fells Point : 91
## SUBWAY : 6 Median : 21218 Inner Harbor: 89
## KENTUCKY FRIED CHICKEN : 5 Mean : 21185 Canton : 81
## BURGER KING : 4 3rd Qu.: 21226 Federal Hill: 42
## DUNKIN DONUTS : 4 Max. : 21287 Mount Vernon: 33
## (Other) :1293 (Other) :863
## councilDistrict policeDistrict
## Min. : 1.000 SOUTHEASTERN:385
## 1st Qu.: 2.000 CENTRAL :288
## Median : 9.000 SOUTHERN :213
## Mean : 7.191 NORTHERN :157
## 3rd Qu.:11.000 NORTHEASTERN: 72
## Max. :14.000 EASTERN : 67
## (Other) :145
## Location.1
## 1101 RUSSELL ST\nBaltimore, MD\n: 9
## 201 PRATT ST\nBaltimore, MD\n : 8
## 2400 BOSTON ST\nBaltimore, MD\n : 8
## 300 LIGHT ST\nBaltimore, MD\n : 5
## 300 CHARLES ST\nBaltimore, MD\n : 4
## 301 LIGHT ST\nBaltimore, MD\n : 4
## (Other) :1289
Quantiles of quantitative variables
# Default quantiles of the council district, ignoring missing values.
quantile(restData$councilDistrict, na.rm = TRUE)
## 0% 25% 50% 75% 100%
## 1 2 9 11 14
# Median, upper quartile and 90th percentile of the council district.
# na.rm = TRUE matches the default-quantile call on the same column, so a
# missing value cannot turn this call into an error.
quantile(
  restData$councilDistrict,
  probs = c(0.5, 0.75, 0.9),
  na.rm = TRUE
)
## 50% 75% 90%
## 9 11 12
Make table
# Count restaurants per zip code; useNA = "ifany" adds an NA column only when
# missing values are present.
table(restData$zipCode, useNA = "ifany")
##
## -21226 21201 21202 21205 21206 21207 21208 21209 21210 21211
## 1 136 201 27 30 4 1 8 23 41
## 21212 21213 21214 21215 21216 21217 21218 21220 21222 21223
## 28 31 17 54 10 32 69 1 7 56
## 21224 21225 21226 21227 21229 21230 21231 21234 21237 21239
## 199 19 18 4 13 156 127 7 1 3
## 21251 21287
## 2 1
Make table (two-dimensional)
# Cross-tabulate council district (rows) against zip code (columns).
table(restData$councilDistrict, restData$zipCode)
##
## -21226 21201 21202 21205 21206 21207 21208 21209 21210 21211 21212
## 1 0 0 37 0 0 0 0 0 0 0 0
## 2 0 0 0 3 27 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 27
## 5 0 0 0 0 0 3 0 6 0 0 0
## 6 0 0 0 0 0 0 0 1 19 0 0
## 7 0 0 0 0 0 0 0 1 0 27 0
## 8 0 0 0 0 0 1 0 0 0 0 0
## 9 0 1 0 0 0 0 0 0 0 0 0
## 10 1 0 1 0 0 0 0 0 0 0 0
## 11 0 115 139 0 0 0 1 0 0 0 1
## 12 0 20 24 4 0 0 0 0 0 0 0
## 13 0 0 0 20 3 0 0 0 0 0 0
## 14 0 0 0 0 0 0 0 0 4 14 0
##
## 21213 21214 21215 21216 21217 21218 21220 21222 21223 21224 21225
## 1 2 0 0 0 0 0 0 7 0 140 1
## 2 0 0 0 0 0 0 0 0 0 54 0
## 3 2 17 0 0 0 3 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 31 0 0 0 0 0 0 0 0
## 6 0 0 15 1 0 0 0 0 0 0 0
## 7 0 0 6 7 15 6 0 0 0 0 0
## 8 0 0 0 0 0 0 0 0 2 0 0
## 9 0 0 0 2 8 0 0 0 53 0 0
## 10 0 0 0 0 0 0 1 0 0 0 18
## 11 0 0 0 0 9 0 0 0 1 0 0
## 12 13 0 0 0 0 26 0 0 0 0 0
## 13 13 0 1 0 0 0 0 0 0 5 0
## 14 1 0 1 0 0 34 0 0 0 0 0
##
## 21226 21227 21229 21230 21231 21234 21237 21239 21251 21287
## 1 0 0 0 1 124 0 0 0 0 0
## 2 0 0 0 0 0 0 1 0 0 0
## 3 0 1 0 0 0 7 0 0 2 0
## 4 0 0 0 0 0 0 0 3 0 0
## 5 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0
## 7 0 0 0 0 0 0 0 0 0 0
## 8 0 2 13 0 0 0 0 0 0 0
## 9 0 0 0 11 0 0 0 0 0 0
## 10 18 0 0 133 0 0 0 0 0 0
## 11 0 0 0 11 0 0 0 0 0 0
## 12 0 0 0 0 2 0 0 0 0 0
## 13 0 1 0 0 1 0 0 0 0 1
## 14 0 0 0 0 0 0 0 0 0 0
Check for missing values
# Number of missing council district values.
sum(is.na(restData[["councilDistrict"]]))
## [1] 0
# TRUE when at least one council district value is missing.
anyNA(restData$councilDistrict)
## [1] FALSE
# TRUE only when every zip code is strictly positive.
all(restData$zipCode > 0)
## [1] FALSE
Row and column sums
# Count the missing values in each column.
colSums(is.na(restData))
## name zipCode neighborhood councilDistrict
## 0 0 0 0
## policeDistrict Location.1
## 0 0
# TRUE when no column contains a missing value.
all(colSums(is.na(restData)) == 0)
## [1] TRUE
Values with specific characteristics
# How many restaurants are (TRUE) or are not (FALSE) in zip code 21212.
table(restData$zipCode %in% "21212")
##
## FALSE TRUE
## 1299 28
# How many restaurants fall in either of the two zip codes.
table(restData$zipCode %in% c("21212", "21213"))
##
## FALSE TRUE
## 1268 59
Values with specific characteristics
# First two rows whose zip code is one of the two listed values.
head(
  restData[restData$zipCode %in% c("21212", "21213"), ],
  n = 2
)
## name zipCode neighborhood councilDistrict policeDistrict
## 29 BAY ATLANTIC CLUB 21212 Downtown 11 CENTRAL
## 39 BERMUDA BAR 21213 Broadway East 12 EASTERN
## Location.1
## 29 206 REDWOOD ST\nBaltimore, MD\n
## 39 1801 NORTH AVE\nBaltimore, MD\n
Cross tabs
# Load the built-in admissions table and flatten it into a data frame.
data(UCBAdmissions)
DF <- as.data.frame(UCBAdmissions)
summary(DF)
## Admit Gender Dept Freq
## Admitted:12 Male :12 A:4 Min. : 8.0
## Rejected:12 Female:12 B:4 1st Qu.: 80.0
## C:4 Median :170.0
## D:4 Mean :188.6
## E:4 3rd Qu.:302.5
## F:4 Max. :512.0
Cross tabs
# Sum Freq within each Gender / Admit combination to build a contingency table.
xt <- xtabs(Freq ~ Gender + Admit, data = DF)
xt
## Admit
## Gender Admitted Rejected
## Male 1198 1493
## Female 557 1278
Flat tables
# Add a replicate index that cycles 1..9 down the rows of the built-in
# warpbreaks data. length.out is spelled out in full (no partial argument
# matching) and taken from the row count instead of a hard-coded 54.
warpbreaks$replicate <- rep(1:9, length.out = nrow(warpbreaks))
# Sum breaks over every remaining column (wool, tension, replicate), which
# yields a three-dimensional contingency table.
xt <- xtabs(breaks ~ ., data = warpbreaks)
xt
## , , replicate = 1
##
## tension
## wool L M H
## A 26 18 36
## B 27 42 20
##
## , , replicate = 2
##
## tension
## wool L M H
## A 30 21 21
## B 14 26 21
##
## , , replicate = 3
##
## tension
## wool L M H
## A 54 29 24
## B 29 19 24
##
## , , replicate = 4
##
## tension
## wool L M H
## A 25 17 18
## B 19 16 17
##
## , , replicate = 5
##
## tension
## wool L M H
## A 70 12 10
## B 29 39 13
##
## , , replicate = 6
##
## tension
## wool L M H
## A 52 18 43
## B 31 28 15
##
## , , replicate = 7
##
## tension
## wool L M H
## A 51 35 28
## B 41 21 15
##
## , , replicate = 8
##
## tension
## wool L M H
## A 26 30 15
## B 20 39 16
##
## , , replicate = 9
##
## tension
## wool L M H
## A 67 36 26
## B 44 29 28
Flat tables
# Collapse the multi-way table into a single flat, two-dimensional layout.
stats::ftable(xt)
## replicate 1 2 3 4 5 6 7 8 9
## wool tension
## A L 26 30 54 25 70 52 51 26 67
## M 18 21 29 17 12 18 35 30 36
## H 36 21 24 18 10 43 28 15 26
## B L 27 14 29 19 29 31 41 20 44
## M 42 26 19 16 39 28 21 39 29
## H 20 21 24 17 13 15 15 16 28
Size of a data set
# Draw 100,000 random normal values and report how much memory they occupy,
# first in bytes and then in megabytes.
fakeData <- rnorm(1e5)
object.size(fakeData)
## 800040 bytes
print(object.size(fakeData), units = "Mb")
## 0.8 Mb
operation
