# Create a folder for the data. dir.exists() is used (rather than
# file.exists()) so that only an actual directory counts as "already there".
dataDir <- "./data"
if (!dir.exists(dataDir)) {
  dir.create(dataDir)
}
# Get data from the web
fileUrl <- "https://data.baltimorecity.gov/api/views/k5ry-ef3g/rows.csv?accessType=DOWNLOAD"
# Single definition of the destination so the download and the read below
# cannot drift apart.
restFile <- file.path(dataDir, "restaurants.csv")
download.file(fileUrl, destfile = restFile)
# Load the downloaded CSV into a data frame used by the rest of the script
restData <- read.csv(restFile)
### Inspect Data
# Preview the first three rows of the restaurant data
head(x = restData, n = 3L)
## name zipCode neighborhood councilDistrict policeDistrict
## 1 410 21206 Frankford 2 NORTHEASTERN
## 2 1919 21231 Fells Point 1 SOUTHEASTERN
## 3 SAUTE 21224 Canton 1 SOUTHEASTERN
## Location.1 X2010.Census.Neighborhoods
## 1 4509 BELAIR ROAD\nBaltimore, MD NA
## 2 1919 FLEET ST\nBaltimore, MD NA
## 3 2844 HUDSON ST\nBaltimore, MD NA
## X2010.Census.Wards.Precincts Zip.Codes
## 1 NA NA
## 2 NA NA
## 3 NA NA
# Preview the last three rows of the restaurant data
tail(x = restData, n = 3L)
## name zipCode neighborhood councilDistrict policeDistrict
## 1325 ZINK'S CAFÂ\220 21213 Belair-Edison 13 NORTHEASTERN
## 1326 ZISSIMOS BAR 21211 Hampden 7 NORTHERN
## 1327 ZORBAS 21224 Greektown 2 SOUTHEASTERN
## Location.1 X2010.Census.Neighborhoods
## 1325 3300 LAWNVIEW AVE\nBaltimore, MD NA
## 1326 1023 36TH ST\nBaltimore, MD NA
## 1327 4710 EASTERN Ave\nBaltimore, MD NA
## X2010.Census.Wards.Precincts Zip.Codes
## 1325 NA NA
## 1326 NA NA
## 1327 NA NA
# Per-column overview: numeric columns get quantiles, others get length/class
summary(object = restData)
## name zipCode neighborhood councilDistrict
## Length:1327 Min. :-21226 Length:1327 Min. : 1.000
## Class :character 1st Qu.: 21202 Class :character 1st Qu.: 2.000
## Mode :character Median : 21218 Mode :character Median : 9.000
## Mean : 21185 Mean : 7.191
## 3rd Qu.: 21226 3rd Qu.:11.000
## Max. : 21287 Max. :14.000
## policeDistrict Location.1 X2010.Census.Neighborhoods
## Length:1327 Length:1327 Mode:logical
## Class :character Class :character NA's:1327
## Mode :character Mode :character
##
##
##
## X2010.Census.Wards.Precincts Zip.Codes
## Mode:logical Mode:logical
## NA's:1327 NA's:1327
##
##
##
##
# Show each column's type together with its first few values
str(object = restData)
## 'data.frame': 1327 obs. of 9 variables:
## $ name : chr "410" "1919" "SAUTE" "#1 CHINESE KITCHEN" ...
## $ zipCode : int 21206 21231 21224 21211 21223 21218 21205 21211 21205 21231 ...
## $ neighborhood : chr "Frankford" "Fells Point" "Canton" "Hampden" ...
## $ councilDistrict : int 2 1 1 14 9 14 13 7 13 1 ...
## $ policeDistrict : chr "NORTHEASTERN" "SOUTHEASTERN" "SOUTHEASTERN" "NORTHERN" ...
## $ Location.1 : chr "4509 BELAIR ROAD\nBaltimore, MD" "1919 FLEET ST\nBaltimore, MD" "2844 HUDSON ST\nBaltimore, MD" "3998 ROLAND AVE\nBaltimore, MD" ...
## $ X2010.Census.Neighborhoods : logi NA NA NA NA NA NA ...
## $ X2010.Census.Wards.Precincts: logi NA NA NA NA NA NA ...
## $ Zip.Codes : logi NA NA NA NA NA NA ...
# Distribution of one feature, ignoring NAs: the five default quartile points
quantile(restData[["councilDistrict"]], probs = seq(0, 1, 0.25), na.rm = TRUE)
## 0% 25% 50% 75% 100%
## 1 2 9 11 14
# Same feature at a custom set of probabilities
quantile(restData[["councilDistrict"]], probs = c(0.5, 0.75, 0.9), na.rm = TRUE)
## 50% 75% 90%
## 9 11 12
# Count restaurants per zip code; an NA column is added only if NAs exist
table(restData[["zipCode"]], useNA = "ifany")
##
## -21226 21201 21202 21205 21206 21207 21208 21209 21210 21211 21212
## 1 136 201 27 30 4 1 8 23 41 28
## 21213 21214 21215 21216 21217 21218 21220 21222 21223 21224 21225
## 31 17 54 10 32 69 1 7 56 199 19
## 21226 21227 21229 21230 21231 21234 21237 21239 21251 21287
## 18 4 13 156 127 7 1 3 2 1
# Two-way counts: council district (rows) by zip code (columns)
table(restData[["councilDistrict"]], restData[["zipCode"]])
##
## -21226 21201 21202 21205 21206 21207 21208 21209 21210 21211 21212 21213
## 1 0 0 37 0 0 0 0 0 0 0 0 2
## 2 0 0 0 3 27 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0 2
## 4 0 0 0 0 0 0 0 0 0 0 27 0
## 5 0 0 0 0 0 3 0 6 0 0 0 0
## 6 0 0 0 0 0 0 0 1 19 0 0 0
## 7 0 0 0 0 0 0 0 1 0 27 0 0
## 8 0 0 0 0 0 1 0 0 0 0 0 0
## 9 0 1 0 0 0 0 0 0 0 0 0 0
## 10 1 0 1 0 0 0 0 0 0 0 0 0
## 11 0 115 139 0 0 0 1 0 0 0 1 0
## 12 0 20 24 4 0 0 0 0 0 0 0 13
## 13 0 0 0 20 3 0 0 0 0 0 0 13
## 14 0 0 0 0 0 0 0 0 4 14 0 1
##
## 21214 21215 21216 21217 21218 21220 21222 21223 21224 21225 21226 21227
## 1 0 0 0 0 0 0 7 0 140 1 0 0
## 2 0 0 0 0 0 0 0 0 54 0 0 0
## 3 17 0 0 0 3 0 0 0 0 0 0 1
## 4 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 31 0 0 0 0 0 0 0 0 0 0
## 6 0 15 1 0 0 0 0 0 0 0 0 0
## 7 0 6 7 15 6 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0 2 0 0 0 2
## 9 0 0 2 8 0 0 0 53 0 0 0 0
## 10 0 0 0 0 0 1 0 0 0 18 18 0
## 11 0 0 0 9 0 0 0 1 0 0 0 0
## 12 0 0 0 0 26 0 0 0 0 0 0 0
## 13 0 1 0 0 0 0 0 0 5 0 0 1
## 14 0 1 0 0 34 0 0 0 0 0 0 0
##
## 21229 21230 21231 21234 21237 21239 21251 21287
## 1 0 1 124 0 0 0 0 0
## 2 0 0 0 0 1 0 0 0
## 3 0 0 0 7 0 0 2 0
## 4 0 0 0 0 0 3 0 0
## 5 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0
## 7 0 0 0 0 0 0 0 0
## 8 13 0 0 0 0 0 0 0
## 9 0 11 0 0 0 0 0 0
## 10 0 133 0 0 0 0 0 0
## 11 0 11 0 0 0 0 0 0
## 12 0 0 2 0 0 0 0 0
## 13 0 0 1 0 0 0 0 1
## 14 0 0 0 0 0 0 0 0
# Use sum(), any(), all() ... to find NAs and sanity-check values
# Number of missing council districts
sum(is.na(restData$councilDistrict))
## [1] 0
# Is any council district missing?
any(is.na(restData$councilDistrict))
## [1] FALSE
# Are all zip codes positive? The comparison must be explicit: all() on the
# raw integer column only tests "non-zero" and would hide the -21226 entry.
all(restData$zipCode > 0)
## [1] FALSE
# Is every column completely free of NAs?
all(colSums(is.na(restData)) == 0)
## [1] FALSE
# Drill into a feature: how many rows have one of two specific zip codes?
table(is.element(restData$zipCode, c("21212", "21213")))
##
## FALSE TRUE
## 1268 59
### Subset Data
# Keep only the rows whose zip code is one of the two of interest
SubsetOf <- subset(restData, zipCode %in% c("21212", "21213"))
# Preview the first rows of the subset
head(SubsetOf, n = 6L)
## name zipCode neighborhood
## 29 BAY ATLANTIC CLUB 21212 Downtown
## 39 BERMUDA BAR 21213 Broadway East
## 92 ATWATER'S 21212 Chinquapin Park-Belvedere
## 111 BALTIMORE ESTONIAN SOCIETY 21213 South Clifton Park
## 187 CAFE ZEN 21212 Rosebank
## 220 CERIELLO FINE FOODS 21212 Chinquapin Park-Belvedere
## councilDistrict policeDistrict Location.1
## 29 11 CENTRAL 206 REDWOOD ST\nBaltimore, MD
## 39 12 EASTERN 1801 NORTH AVE\nBaltimore, MD
## 92 4 NORTHERN 529 BELVEDERE AVE\nBaltimore, MD
## 111 12 EASTERN 1932 BELAIR RD\nBaltimore, MD
## 187 4 NORTHERN 438 BELVEDERE AVE\nBaltimore, MD
## 220 4 NORTHERN 529 BELVEDERE AVE\nBaltimore, MD
## X2010.Census.Neighborhoods X2010.Census.Wards.Precincts Zip.Codes
## 29 NA NA NA
## 39 NA NA NA
## 92 NA NA NA
## 111 NA NA NA
## 187 NA NA NA
## 220 NA NA NA
### Make Cross Tabs
# Load the built-in Berkeley admissions table and flatten it to a data frame
data(UCBAdmissions)
DF <- as.data.frame(UCBAdmissions)
# Summarise every column of the flattened table
summary(DF)
## Admit Gender Dept Freq
## Admitted:12 Male :12 A:4 Min. : 8.0
## Rejected:12 Female:12 B:4 1st Qu.: 80.0
## C:4 Median :170.0
## D:4 Mean :188.6
## E:4 3rd Qu.:302.5
## F:4 Max. :512.0
# Cross-tabulate: total Freq for each Gender (rows) by Admit (columns)
xt <- xtabs(formula = Freq ~ Gender + Admit, data = DF)
xt
## Admit
## Gender Admitted Rejected
## Male 1198 1493
## Female 557 1278
# Add a replicate index (1-9, recycled over all rows) to the built-in
# warpbreaks data. length.out is spelled in full instead of relying on
# partial argument matching, and the row count is taken from the data.
warpbreaks$replicate <- rep(1:9, length.out = nrow(warpbreaks))
# Cross-tabulate breaks against every other column; the resulting
# three-dimensional table is difficult to read when printed directly
xt <- xtabs(breaks ~ ., data = warpbreaks)
xt
## , , replicate = 1
##
## tension
## wool L M H
## A 26 18 36
## B 27 42 20
##
## , , replicate = 2
##
## tension
## wool L M H
## A 30 21 21
## B 14 26 21
##
## , , replicate = 3
##
## tension
## wool L M H
## A 54 29 24
## B 29 19 24
##
## , , replicate = 4
##
## tension
## wool L M H
## A 25 17 18
## B 19 16 17
##
## , , replicate = 5
##
## tension
## wool L M H
## A 70 12 10
## B 29 39 13
##
## , , replicate = 6
##
## tension
## wool L M H
## A 52 18 43
## B 31 28 15
##
## , , replicate = 7
##
## tension
## wool L M H
## A 51 35 28
## B 41 21 15
##
## , , replicate = 8
##
## tension
## wool L M H
## A 26 30 15
## B 20 39 16
##
## , , replicate = 9
##
## tension
## wool L M H
## A 67 36 26
## B 44 29 28
# Flatten the three-way table for readability: the first two dimensions
# become the row variables and the remaining one the columns
ftable(xt, row.vars = 1:2)
## replicate 1 2 3 4 5 6 7 8 9
## wool tension
## A L 26 30 54 25 70 52 51 26 67
## M 18 21 29 17 12 18 35 30 36
## H 36 21 24 18 10 43 28 15 26
## B L 27 14 29 19 29 31 41 20 44
## M 42 26 19 16 39 28 21 39 29
## H 20 21 24 17 13 15 15 16 28
### See the size of data
# Simulate 100,000 standard-normal draws to have something to measure
fakeData <- rnorm(n = 1e5)
# In-memory size, reported in bytes by default
object.size(x = fakeData)
## 800048 bytes
# The same size, reported in megabytes
print(object.size(x = fakeData), units = "Mb")
## 0.8 Mb
# This is an R Markdown document; feel free to reach out for finer details.