Summarizing Data

Import Data

# Create a folder for the data. dir.exists() is used because file.exists()
# is also TRUE for a regular file that happens to be named "data".
dataDir <- "./data"
if (!dir.exists(dataDir)) {
  dir.create(dataDir)
}

# Get the data from the web. The download is skipped when a local copy is
# already present, so re-running the script does not hit the network again;
# delete the CSV file to force a fresh download.
fileUrl <- "https://data.baltimorecity.gov/api/views/k5ry-ef3g/rows.csv?accessType=DOWNLOAD"
restFile <- file.path(dataDir, "restaurants.csv")
if (!file.exists(restFile)) {
  download.file(fileUrl, destfile = restFile)
}

# Read the restaurant data into a data frame.
restData <- read.csv(restFile)

### Inspect Data

# Peek at the first three rows of the data
utils::head(restData, n = 3L)
##    name zipCode neighborhood councilDistrict policeDistrict
## 1   410   21206    Frankford               2   NORTHEASTERN
## 2  1919   21231  Fells Point               1   SOUTHEASTERN
## 3 SAUTE   21224       Canton               1   SOUTHEASTERN
##                        Location.1 X2010.Census.Neighborhoods
## 1 4509 BELAIR ROAD\nBaltimore, MD                         NA
## 2    1919 FLEET ST\nBaltimore, MD                         NA
## 3   2844 HUDSON ST\nBaltimore, MD                         NA
##   X2010.Census.Wards.Precincts Zip.Codes
## 1                           NA        NA
## 2                           NA        NA
## 3                           NA        NA
# Peek at the last three rows of the data
utils::tail(restData, n = 3L)
##              name zipCode  neighborhood councilDistrict policeDistrict
## 1325 ZINK'S CAFÂ\220   21213 Belair-Edison              13   NORTHEASTERN
## 1326 ZISSIMOS BAR   21211       Hampden               7       NORTHERN
## 1327       ZORBAS   21224     Greektown               2   SOUTHEASTERN
##                            Location.1 X2010.Census.Neighborhoods
## 1325 3300 LAWNVIEW AVE\nBaltimore, MD                         NA
## 1326      1023 36TH ST\nBaltimore, MD                         NA
## 1327  4710 EASTERN Ave\nBaltimore, MD                         NA
##      X2010.Census.Wards.Precincts Zip.Codes
## 1325                           NA        NA
## 1326                           NA        NA
## 1327                           NA        NA
# Get an overall summary of every variable in the data frame
summary(object = restData)
##      name              zipCode       neighborhood       councilDistrict 
##  Length:1327        Min.   :-21226   Length:1327        Min.   : 1.000  
##  Class :character   1st Qu.: 21202   Class :character   1st Qu.: 2.000  
##  Mode  :character   Median : 21218   Mode  :character   Median : 9.000  
##                     Mean   : 21185                      Mean   : 7.191  
##                     3rd Qu.: 21226                      3rd Qu.:11.000  
##                     Max.   : 21287                      Max.   :14.000  
##  policeDistrict      Location.1        X2010.Census.Neighborhoods
##  Length:1327        Length:1327        Mode:logical              
##  Class :character   Class :character   NA's:1327                 
##  Mode  :character   Mode  :character                             
##                                                                  
##                                                                  
##                                                                  
##  X2010.Census.Wards.Precincts Zip.Codes     
##  Mode:logical                 Mode:logical  
##  NA's:1327                    NA's:1327     
##                                             
##                                             
##                                             
## 
# Check the type of each variable and show a few example values
utils::str(object = restData)
## 'data.frame':    1327 obs. of  9 variables:
##  $ name                        : chr  "410" "1919" "SAUTE" "#1 CHINESE KITCHEN" ...
##  $ zipCode                     : int  21206 21231 21224 21211 21223 21218 21205 21211 21205 21231 ...
##  $ neighborhood                : chr  "Frankford" "Fells Point" "Canton" "Hampden" ...
##  $ councilDistrict             : int  2 1 1 14 9 14 13 7 13 1 ...
##  $ policeDistrict              : chr  "NORTHEASTERN" "SOUTHEASTERN" "SOUTHEASTERN" "NORTHERN" ...
##  $ Location.1                  : chr  "4509 BELAIR ROAD\nBaltimore, MD" "1919 FLEET ST\nBaltimore, MD" "2844 HUDSON ST\nBaltimore, MD" "3998 ROLAND AVE\nBaltimore, MD" ...
##  $ X2010.Census.Neighborhoods  : logi  NA NA NA NA NA NA ...
##  $ X2010.Census.Wards.Precincts: logi  NA NA NA NA NA NA ...
##  $ Zip.Codes                   : logi  NA NA NA NA NA NA ...
# Look at the distribution of a feature, dropping NAs before computing
quantile(restData[["councilDistrict"]], na.rm = TRUE)
##   0%  25%  50%  75% 100% 
##    1    2    9   11   14
quantile(
  restData[["councilDistrict"]],
  probs = c(0.5, 0.75, 0.9),
  na.rm = TRUE
)
## 50% 75% 90% 
##   9  11  12
# Tabulate the zip codes; an NA bucket is listed only if any NAs are present
table(restData[["zipCode"]], useNA = "ifany")
## 
## -21226  21201  21202  21205  21206  21207  21208  21209  21210  21211  21212 
##      1    136    201     27     30      4      1      8     23     41     28 
##  21213  21214  21215  21216  21217  21218  21220  21222  21223  21224  21225 
##     31     17     54     10     32     69      1      7     56    199     19 
##  21226  21227  21229  21230  21231  21234  21237  21239  21251  21287 
##     18      4     13    156    127      7      1      3      2      1
# Two-way table: council district (rows) against zip code (columns)
table(restData[["councilDistrict"]], restData[["zipCode"]])
##     
##      -21226 21201 21202 21205 21206 21207 21208 21209 21210 21211 21212 21213
##   1       0     0    37     0     0     0     0     0     0     0     0     2
##   2       0     0     0     3    27     0     0     0     0     0     0     0
##   3       0     0     0     0     0     0     0     0     0     0     0     2
##   4       0     0     0     0     0     0     0     0     0     0    27     0
##   5       0     0     0     0     0     3     0     6     0     0     0     0
##   6       0     0     0     0     0     0     0     1    19     0     0     0
##   7       0     0     0     0     0     0     0     1     0    27     0     0
##   8       0     0     0     0     0     1     0     0     0     0     0     0
##   9       0     1     0     0     0     0     0     0     0     0     0     0
##   10      1     0     1     0     0     0     0     0     0     0     0     0
##   11      0   115   139     0     0     0     1     0     0     0     1     0
##   12      0    20    24     4     0     0     0     0     0     0     0    13
##   13      0     0     0    20     3     0     0     0     0     0     0    13
##   14      0     0     0     0     0     0     0     0     4    14     0     1
##     
##      21214 21215 21216 21217 21218 21220 21222 21223 21224 21225 21226 21227
##   1      0     0     0     0     0     0     7     0   140     1     0     0
##   2      0     0     0     0     0     0     0     0    54     0     0     0
##   3     17     0     0     0     3     0     0     0     0     0     0     1
##   4      0     0     0     0     0     0     0     0     0     0     0     0
##   5      0    31     0     0     0     0     0     0     0     0     0     0
##   6      0    15     1     0     0     0     0     0     0     0     0     0
##   7      0     6     7    15     6     0     0     0     0     0     0     0
##   8      0     0     0     0     0     0     0     2     0     0     0     2
##   9      0     0     2     8     0     0     0    53     0     0     0     0
##   10     0     0     0     0     0     1     0     0     0    18    18     0
##   11     0     0     0     9     0     0     0     1     0     0     0     0
##   12     0     0     0     0    26     0     0     0     0     0     0     0
##   13     0     1     0     0     0     0     0     0     5     0     0     1
##   14     0     1     0     0    34     0     0     0     0     0     0     0
##     
##      21229 21230 21231 21234 21237 21239 21251 21287
##   1      0     1   124     0     0     0     0     0
##   2      0     0     0     0     1     0     0     0
##   3      0     0     0     7     0     0     2     0
##   4      0     0     0     0     0     3     0     0
##   5      0     0     0     0     0     0     0     0
##   6      0     0     0     0     0     0     0     0
##   7      0     0     0     0     0     0     0     0
##   8     13     0     0     0     0     0     0     0
##   9      0    11     0     0     0     0     0     0
##   10     0   133     0     0     0     0     0     0
##   11     0    11     0     0     0     0     0     0
##   12     0     0     2     0     0     0     0     0
##   13     0     0     1     0     0     0     0     1
##   14     0     0     0     0     0     0     0     0
# use sum(), any(), all() ... to check for missing or invalid values
sum(is.na(restData$councilDistrict))
## [1] 0
any(is.na(restData$councilDistrict))
## [1] FALSE
# all() expects a logical condition. Passing the raw integer column only
# tests for non-zero values (with a coercion warning), so check that every
# zip code is positive instead; this is FALSE because of the -21226 entry.
all(restData$zipCode > 0)
## [1] FALSE
# TRUE only when no column of the data frame contains an NA
all(colSums(is.na(restData)) == 0)
## [1] FALSE
# Drill into a feature: count the rows whose zip code is 21212 or 21213
table(is.element(restData$zipCode, c("21212", "21213")))
## 
## FALSE  TRUE 
##  1268    59

### Subset Data

# Use a logical condition to keep only the rows in zip codes 21212 or 21213
SubsetOf <- restData[is.element(restData$zipCode, c("21212", "21213")), ]

# Show the first six rows of the subset
utils::head(SubsetOf, n = 6L)
##                           name zipCode              neighborhood
## 29           BAY ATLANTIC CLUB   21212                  Downtown
## 39                 BERMUDA BAR   21213             Broadway East
## 92                   ATWATER'S   21212 Chinquapin Park-Belvedere
## 111 BALTIMORE ESTONIAN SOCIETY   21213        South Clifton Park
## 187                   CAFE ZEN   21212                  Rosebank
## 220        CERIELLO FINE FOODS   21212 Chinquapin Park-Belvedere
##     councilDistrict policeDistrict                       Location.1
## 29               11        CENTRAL    206 REDWOOD ST\nBaltimore, MD
## 39               12        EASTERN    1801 NORTH AVE\nBaltimore, MD
## 92                4       NORTHERN 529 BELVEDERE AVE\nBaltimore, MD
## 111              12        EASTERN    1932 BELAIR RD\nBaltimore, MD
## 187               4       NORTHERN 438 BELVEDERE AVE\nBaltimore, MD
## 220               4       NORTHERN 529 BELVEDERE AVE\nBaltimore, MD
##     X2010.Census.Neighborhoods X2010.Census.Wards.Precincts Zip.Codes
## 29                          NA                           NA        NA
## 39                          NA                           NA        NA
## 92                          NA                           NA        NA
## 111                         NA                           NA        NA
## 187                         NA                           NA        NA
## 220                         NA                           NA        NA

### Make Cross Tabs

# Load the built-in UCBAdmissions data set and flatten it to a data frame
data("UCBAdmissions")
DF <- as.data.frame(UCBAdmissions)

# Create a summary of every column
summary(object = DF)
##       Admit       Gender   Dept       Freq      
##  Admitted:12   Male  :12   A:4   Min.   :  8.0  
##  Rejected:12   Female:12   B:4   1st Qu.: 80.0  
##                            C:4   Median :170.0  
##                            D:4   Mean   :188.6  
##                            E:4   3rd Qu.:302.5  
##                            F:4   Max.   :512.0
# Identify relationships: sum Freq for each Gender x Admit combination

xt <- xtabs(formula = Freq ~ Gender + Admit, data = DF)
print(xt)
##         Admit
## Gender   Admitted Rejected
##   Male       1198     1493
##   Female      557     1278

Other Methods

# Add a replicate index (1 to 9, recycled) to every row of warpbreaks.
# length.out is spelled out in full instead of relying on partial argument
# matching of `len`, and it is taken from the data rather than hard-coded.
warpbreaks$replicate <- rep(1:9, length.out = nrow(warpbreaks))

# Identify relationships: sum of breaks over all remaining variables.
# The result is a three-way table - difficult to see.
xt <- xtabs(breaks ~ ., data = warpbreaks)
xt
## , , replicate = 1
## 
##     tension
## wool  L  M  H
##    A 26 18 36
##    B 27 42 20
## 
## , , replicate = 2
## 
##     tension
## wool  L  M  H
##    A 30 21 21
##    B 14 26 21
## 
## , , replicate = 3
## 
##     tension
## wool  L  M  H
##    A 54 29 24
##    B 29 19 24
## 
## , , replicate = 4
## 
##     tension
## wool  L  M  H
##    A 25 17 18
##    B 19 16 17
## 
## , , replicate = 5
## 
##     tension
## wool  L  M  H
##    A 70 12 10
##    B 29 39 13
## 
## , , replicate = 6
## 
##     tension
## wool  L  M  H
##    A 52 18 43
##    B 31 28 15
## 
## , , replicate = 7
## 
##     tension
## wool  L  M  H
##    A 51 35 28
##    B 41 21 15
## 
## , , replicate = 8
## 
##     tension
## wool  L  M  H
##    A 26 30 15
##    B 20 39 16
## 
## , , replicate = 9
## 
##     tension
## wool  L  M  H
##    A 67 36 26
##    B 44 29 28
# Flatten the multi-way table into a single two-dimensional layout - easy to see
ftable(x = xt)
##              replicate  1  2  3  4  5  6  7  8  9
## wool tension                                     
## A    L                 26 30 54 25 70 52 51 26 67
##      M                 18 21 29 17 12 18 35 30 36
##      H                 36 21 24 18 10 43 28 15 26
## B    L                 27 14 29 19 29 31 41 20 44
##      M                 42 26 19 16 39 28 21 39 29
##      H                 20 21 24 17 13 15 15 16 28

### See the size of data

# Simulate 100,000 standard-normal values and report their memory footprint
fakeData <- rnorm(n = 1e5)
object.size(x = fakeData)
## 800048 bytes
# Same size, displayed in megabytes
print(object.size(x = fakeData), units = "Mb")
## 0.8 Mb

This is an R Markdown document. Feel free to reach out for finer details.