restData <- read.csv("./data/restaurants.csv", encoding = "UTF-8")

Look at a bit of the data

head(restData,n=3)
##    name zipCode neighborhood councilDistrict policeDistrict
## 1   410   21206    Frankford               2   NORTHEASTERN
## 2  1919   21231  Fells Point               1   SOUTHEASTERN
## 3 SAUTE   21224       Canton               1   SOUTHEASTERN
##                          Location.1
## 1 4509 BELAIR ROAD\nBaltimore, MD\n
## 2    1919 FLEET ST\nBaltimore, MD\n
## 3   2844 HUDSON ST\nBaltimore, MD\n
tail(restData,n=3)
##                    name zipCode  neighborhood councilDistrict
## 1325 ZINK'S CAF<U+0090>   21213 Belair-Edison              13
## 1326       ZISSIMOS BAR   21211       Hampden               7
## 1327             ZORBAS   21224     Greektown               2
##      policeDistrict                         Location.1
## 1325   NORTHEASTERN 3300 LAWNVIEW AVE\nBaltimore, MD\n
## 1326       NORTHERN      1023 36TH ST\nBaltimore, MD\n
## 1327   SOUTHEASTERN  4710 EASTERN Ave\nBaltimore, MD\n

Make summary

summary(restData)
##                            name         zipCode             neighborhood
##  MCDONALD'S                  :   8   Min.   :-21226   Downtown    :128  
##  POPEYES FAMOUS FRIED CHICKEN:   7   1st Qu.: 21202   Fells Point : 91  
##  SUBWAY                      :   6   Median : 21218   Inner Harbor: 89  
##  KENTUCKY FRIED CHICKEN      :   5   Mean   : 21185   Canton      : 81  
##  BURGER KING                 :   4   3rd Qu.: 21226   Federal Hill: 42  
##  DUNKIN DONUTS               :   4   Max.   : 21287   Mount Vernon: 33  
##  (Other)                     :1293                    (Other)     :863  
##  councilDistrict       policeDistrict
##  Min.   : 1.000   SOUTHEASTERN:385   
##  1st Qu.: 2.000   CENTRAL     :288   
##  Median : 9.000   SOUTHERN    :213   
##  Mean   : 7.191   NORTHERN    :157   
##  3rd Qu.:11.000   NORTHEASTERN: 72   
##  Max.   :14.000   EASTERN     : 67   
##                   (Other)     :145   
##                           Location.1    
##  1101 RUSSELL ST\nBaltimore, MD\n:   9  
##  201 PRATT ST\nBaltimore, MD\n   :   8  
##  2400 BOSTON ST\nBaltimore, MD\n :   8  
##  300 LIGHT ST\nBaltimore, MD\n   :   5  
##  300 CHARLES ST\nBaltimore, MD\n :   4  
##  301 LIGHT ST\nBaltimore, MD\n   :   4  
##  (Other)                         :1289

More in depth information

str(restData)
## 'data.frame':    1327 obs. of  6 variables:
##  $ name           : Factor w/ 1277 levels "#1 CHINESE KITCHEN",..: 9 3 992 1 2 4 5 6 7 8 ...
##  $ zipCode        : int  21206 21231 21224 21211 21223 21218 21205 21211 21205 21231 ...
##  $ neighborhood   : Factor w/ 173 levels "Abell","Arlington",..: 53 52 18 66 104 33 98 133 98 157 ...
##  $ councilDistrict: int  2 1 1 14 9 14 13 7 13 1 ...
##  $ policeDistrict : Factor w/ 9 levels "CENTRAL","EASTERN",..: 3 6 6 4 8 3 6 4 6 6 ...
##  $ Location.1     : Factor w/ 1210 levels "1 BIDDLE ST\nBaltimore, MD\n",..: 835 334 554 755 492 537 505 530 507 569 ...

Quantiles of quantitative variables

quantile(restData$councilDistrict,na.rm=TRUE)
##   0%  25%  50%  75% 100% 
##    1    2    9   11   14
quantile(restData$councilDistrict,probs=c(0.5,0.75,0.9))
## 50% 75% 90% 
##   9  11  12

Make table

table(restData$zipCode,useNA="ifany")
## 
## -21226  21201  21202  21205  21206  21207  21208  21209  21210  21211 
##      1    136    201     27     30      4      1      8     23     41 
##  21212  21213  21214  21215  21216  21217  21218  21220  21222  21223 
##     28     31     17     54     10     32     69      1      7     56 
##  21224  21225  21226  21227  21229  21230  21231  21234  21237  21239 
##    199     19     18      4     13    156    127      7      1      3 
##  21251  21287 
##      2      1

Make table

table(restData$councilDistrict,restData$zipCode)
##     
##      -21226 21201 21202 21205 21206 21207 21208 21209 21210 21211 21212
##   1       0     0    37     0     0     0     0     0     0     0     0
##   2       0     0     0     3    27     0     0     0     0     0     0
##   3       0     0     0     0     0     0     0     0     0     0     0
##   4       0     0     0     0     0     0     0     0     0     0    27
##   5       0     0     0     0     0     3     0     6     0     0     0
##   6       0     0     0     0     0     0     0     1    19     0     0
##   7       0     0     0     0     0     0     0     1     0    27     0
##   8       0     0     0     0     0     1     0     0     0     0     0
##   9       0     1     0     0     0     0     0     0     0     0     0
##   10      1     0     1     0     0     0     0     0     0     0     0
##   11      0   115   139     0     0     0     1     0     0     0     1
##   12      0    20    24     4     0     0     0     0     0     0     0
##   13      0     0     0    20     3     0     0     0     0     0     0
##   14      0     0     0     0     0     0     0     0     4    14     0
##     
##      21213 21214 21215 21216 21217 21218 21220 21222 21223 21224 21225
##   1      2     0     0     0     0     0     0     7     0   140     1
##   2      0     0     0     0     0     0     0     0     0    54     0
##   3      2    17     0     0     0     3     0     0     0     0     0
##   4      0     0     0     0     0     0     0     0     0     0     0
##   5      0     0    31     0     0     0     0     0     0     0     0
##   6      0     0    15     1     0     0     0     0     0     0     0
##   7      0     0     6     7    15     6     0     0     0     0     0
##   8      0     0     0     0     0     0     0     0     2     0     0
##   9      0     0     0     2     8     0     0     0    53     0     0
##   10     0     0     0     0     0     0     1     0     0     0    18
##   11     0     0     0     0     9     0     0     0     1     0     0
##   12    13     0     0     0     0    26     0     0     0     0     0
##   13    13     0     1     0     0     0     0     0     0     5     0
##   14     1     0     1     0     0    34     0     0     0     0     0
##     
##      21226 21227 21229 21230 21231 21234 21237 21239 21251 21287
##   1      0     0     0     1   124     0     0     0     0     0
##   2      0     0     0     0     0     0     1     0     0     0
##   3      0     1     0     0     0     7     0     0     2     0
##   4      0     0     0     0     0     0     0     3     0     0
##   5      0     0     0     0     0     0     0     0     0     0
##   6      0     0     0     0     0     0     0     0     0     0
##   7      0     0     0     0     0     0     0     0     0     0
##   8      0     2    13     0     0     0     0     0     0     0
##   9      0     0     0    11     0     0     0     0     0     0
##   10    18     0     0   133     0     0     0     0     0     0
##   11     0     0     0    11     0     0     0     0     0     0
##   12     0     0     0     0     2     0     0     0     0     0
##   13     0     1     0     0     1     0     0     0     0     1
##   14     0     0     0     0     0     0     0     0     0     0

Check for missing values

sum(is.na(restData$councilDistrict))
## [1] 0
any(is.na(restData$councilDistrict))
## [1] FALSE
all(restData$zipCode > 0)
## [1] FALSE

Row and column sums

colSums(is.na(restData))
##            name         zipCode    neighborhood councilDistrict 
##               0               0               0               0 
##  policeDistrict      Location.1 
##               0               0
all(colSums(is.na(restData))==0)
## [1] TRUE

Values with specific characteristics

table(restData$zipCode %in% c("21212"))
## 
## FALSE  TRUE 
##  1299    28
table(restData$zipCode %in% c("21212","21213"))
## 
## FALSE  TRUE 
##  1268    59

Values with specific characteristics

head(restData[restData$zipCode %in% c("21212","21213"),], 2)
##                 name zipCode  neighborhood councilDistrict policeDistrict
## 29 BAY ATLANTIC CLUB   21212      Downtown              11        CENTRAL
## 39       BERMUDA BAR   21213 Broadway East              12        EASTERN
##                         Location.1
## 29 206 REDWOOD ST\nBaltimore, MD\n
## 39 1801 NORTH AVE\nBaltimore, MD\n

Cross tabs

data(UCBAdmissions)
DF = as.data.frame(UCBAdmissions)
summary(DF)
##       Admit       Gender   Dept       Freq      
##  Admitted:12   Male  :12   A:4   Min.   :  8.0  
##  Rejected:12   Female:12   B:4   1st Qu.: 80.0  
##                            C:4   Median :170.0  
##                            D:4   Mean   :188.6  
##                            E:4   3rd Qu.:302.5  
##                            F:4   Max.   :512.0

Cross tabs

xt <- xtabs(Freq ~ Gender + Admit,data=DF)
xt
##         Admit
## Gender   Admitted Rejected
##   Male       1198     1493
##   Female      557     1278

Flat tables

warpbreaks$replicate <- rep(1:9, len = 54)
xt = xtabs(breaks ~.,data=warpbreaks)
xt
## , , replicate = 1
## 
##     tension
## wool  L  M  H
##    A 26 18 36
##    B 27 42 20
## 
## , , replicate = 2
## 
##     tension
## wool  L  M  H
##    A 30 21 21
##    B 14 26 21
## 
## , , replicate = 3
## 
##     tension
## wool  L  M  H
##    A 54 29 24
##    B 29 19 24
## 
## , , replicate = 4
## 
##     tension
## wool  L  M  H
##    A 25 17 18
##    B 19 16 17
## 
## , , replicate = 5
## 
##     tension
## wool  L  M  H
##    A 70 12 10
##    B 29 39 13
## 
## , , replicate = 6
## 
##     tension
## wool  L  M  H
##    A 52 18 43
##    B 31 28 15
## 
## , , replicate = 7
## 
##     tension
## wool  L  M  H
##    A 51 35 28
##    B 41 21 15
## 
## , , replicate = 8
## 
##     tension
## wool  L  M  H
##    A 26 30 15
##    B 20 39 16
## 
## , , replicate = 9
## 
##     tension
## wool  L  M  H
##    A 67 36 26
##    B 44 29 28

Flat tables

ftable(xt)
##              replicate  1  2  3  4  5  6  7  8  9
## wool tension                                     
## A    L                 26 30 54 25 70 52 51 26 67
##      M                 18 21 29 17 12 18 35 30 36
##      H                 36 21 24 18 10 43 28 15 26
## B    L                 27 14 29 19 29 31 41 20 44
##      M                 42 26 19 16 39 28 21 39 29
##      H                 20 21 24 17 13 15 15 16 28

Size of a data set

fakeData = rnorm(1e5)
object.size(fakeData)
## 800040 bytes
print(object.size(fakeData),units="Mb")
## 0.8 Mb

operation

Built-in Functions