# Subsetting and Sorting ----
# Fix the RNG seed so the random permutations below are repeatable.
set.seed(13435)
# Three columns, each an independent random permutation of five consecutive
# integers (1-5, 6-10 and 11-15).
X <- data.frame(
  "var1" = sample(1:5),
  "var2" = sample(6:10),
  "var3" = sample(11:15)
)
X
## var1 var2 var3
## 1 2 8 15
## 2 3 7 12
## 3 5 6 14
## 4 1 10 11
## 5 4 9 13
# Shuffle the rows by indexing with a random permutation of the row
# positions; each row keeps its original row name.
X <- X[sample(1:5), ]
X
## var1 var2 var3
## 1 2 8 15
## 4 1 10 11
## 2 3 7 12
## 3 5 6 14
## 5 4 9 13
# Blank out var1 in the first and third rows (by position) so the examples
# that follow have missing values to handle.
X$var1[c(1, 3)] <- NA
X
## var1 var2 var3
## 1 NA 8 15
## 4 1 10 11
## 2 NA 7 12
## 3 5 6 14
## 5 4 9 13
# Select a column by position; the result prints as a plain vector.
X[, 1]
## [1] NA 1 NA 5 4
# Select the same column by name.
X[, "var1"]
## [1] NA 1 NA 5 4
# Restrict rows and column together: the first three rows of var1.
X[1:3, "var1"]
## [1] NA 1 NA
## Logical subsetting with & (and) and | (or)
# With &, the rows whose var1 is NA make the whole condition NA (var3 >= 10
# is TRUE for them), and an NA row index comes back as a row filled with NA.
X[X$var1 <= 6 & X$var3 >= 10, ]
## var1 var2 var3
## NA NA NA NA
## 4 1 10 11
## NA.1 NA NA NA
## 3 5 6 14
## 5 4 9 13
# With |, var3 >= 10 is TRUE for every row, so NA | TRUE is TRUE and all
# rows are returned unchanged.
X[X$var1 <= 6 | X$var3 >= 10, ]
## var1 var2 var3
## 1 NA 8 15
## 4 1 10 11
## 2 NA 7 12
## 3 5 6 14
## 5 4 9 13
# which() returns only the positions where the condition is TRUE, so rows
# with a missing var1 are dropped instead of turning into NA rows.
X[which(X$var1 <= 6), ]
## var1 var2 var3
## 4 1 10 11
## 3 5 6 14
## 5 4 9 13
# sort() drops missing values by default.
sort(X$var1)
## [1] 1 4 5
# TRUE is spelled out: T is an ordinary variable that can be reassigned,
# whereas TRUE is a reserved word.
sort(X$var1, decreasing = TRUE)
## [1] 5 4 1
# na.last = TRUE keeps the missing values and places them at the end.
sort(X$var1, na.last = TRUE)
## [1] 1 4 5 NA NA
# order() gives the row permutation that sorts var1 ascending; rows with a
# missing var1 end up last.
X[order(X$var1), ]
## var1 var2 var3
## 4 1 10 11
## 5 4 9 13
## 3 5 6 14
## 1 NA 8 15
## 2 NA 7 12
# A second key breaks ties: the two rows with a missing var1 are now
# ordered by var2.
X[order(X$var1, X$var2), ]
## var1 var2 var3
## 4 1 10 11
## 5 4 9 13
## 3 5 6 14
## 2 NA 7 12
## 1 NA 8 15
# Ordering with plyr ----
library(plyr)
## Warning: package 'plyr' was built under R version 3.3.3
# arrange() sorts the data frame by var1 ascending; the printed row names
# are renumbered 1 to 5.
arrange(X, var1)
## var1 var2 var3
## 1 1 10 11
## 2 4 9 13
## 3 5 6 14
## 4 NA 8 15
## 5 NA 7 12
# desc() reverses the sort key; the rows with a missing var1 still come last.
arrange(X, desc(var1))
## var1 var2 var3
## 1 5 6 14
## 2 4 9 13
## 3 1 10 11
## 4 NA 8 15
## 5 NA 7 12
# Append a new column holding five standard-normal draws.
X$var4 <- rnorm(5)
X
## var1 var2 var3 var4
## 1 NA 8 15 0.1875960
## 4 1 10 11 1.7869764
## 2 NA 7 12 0.4966936
## 3 5 6 14 0.0631830
## 5 4 9 13 -0.5361329
# cbind() with an unnamed vector adds it as a new right-hand column whose
# name is the expression that produced it.
Y <- cbind(X, rnorm(5))
Y
## var1 var2 var3 var4 rnorm(5)
## 1 NA 8 15 0.1875960 0.62578490
## 4 1 10 11 1.7869764 -2.45083750
## 2 NA 7 12 0.4966936 0.08909424
## 3 5 6 14 0.0631830 0.47838570
## 5 4 9 13 -0.5361329 1.00053336
# Summarizing Data ----
# Make sure the local data directory exists before downloading into it.
if (!file.exists("./data")) {
  dir.create("./data")
}
# CSV export from the Baltimore City open-data portal.
url <- "https://data.baltimorecity.gov/api/views/k5ry-ef3g/rows.csv?accessType=DOWNLOAD"
# Let R pick the download method rather than forcing method = "curl", which
# needs an external curl executable and fails on machines that lack one.
download.file(url, "./data/restaurants.csv")
# stringsAsFactors = TRUE keeps the text columns as factors on every R
# version (the default became FALSE in R 4.0.0); the str() and summary()
# output shown further down reports these columns as factors.
restData <- read.csv("./data/restaurants.csv", stringsAsFactors = TRUE)
# Show the first three rows of the restaurant data.
head(restData, n = 3)
## name zipCode neighborhood councilDistrict policeDistrict
## 1 410 21206 Frankford 2 NORTHEASTERN
## 2 1919 21231 Fells Point 1 SOUTHEASTERN
## 3 SAUTE 21224 Canton 1 SOUTHEASTERN
## Location.1
## 1 4509 BELAIR ROAD\nBaltimore, MD\n
## 2 1919 FLEET ST\nBaltimore, MD\n
## 3 2844 HUDSON ST\nBaltimore, MD\n
# Per-column overview: numeric summaries for the integer columns and level
# counts for the factor columns.
# NOTE(review): the output reports a minimum zipCode of -21226, which looks
# like a malformed record in the source data - confirm before relying on it.
summary(restData)
## name zipCode neighborhood
## MCDONALD'S : 8 Min. :-21226 Downtown :128
## POPEYES FAMOUS FRIED CHICKEN: 7 1st Qu.: 21202 Fells Point : 91
## SUBWAY : 6 Median : 21218 Inner Harbor: 89
## KENTUCKY FRIED CHICKEN : 5 Mean : 21185 Canton : 81
## BURGER KING : 4 3rd Qu.: 21226 Federal Hill: 42
## DUNKIN DONUTS : 4 Max. : 21287 Mount Vernon: 33
## (Other) :1293 (Other) :863
## councilDistrict policeDistrict
## Min. : 1.000 SOUTHEASTERN:385
## 1st Qu.: 2.000 CENTRAL :288
## Median : 9.000 SOUTHERN :213
## Mean : 7.191 NORTHERN :157
## 3rd Qu.:11.000 NORTHEASTERN: 72
## Max. :14.000 EASTERN : 67
## (Other) :145
## Location.1
## 1101 RUSSELL ST\nBaltimore, MD\n: 9
## 201 PRATT ST\nBaltimore, MD\n : 8
## 2400 BOSTON ST\nBaltimore, MD\n : 8
## 300 LIGHT ST\nBaltimore, MD\n : 5
## 300 CHARLES ST\nBaltimore, MD\n : 4
## 301 LIGHT ST\nBaltimore, MD\n : 4
## (Other) :1289
# Compact structure listing: dimensions, then each column's type and its
# first few values.
str(restData)
## 'data.frame': 1327 obs. of 6 variables:
## $ name : Factor w/ 1277 levels "#1 CHINESE KITCHEN",..: 9 3 992 1 2 4 5 6 7 8 ...
## $ zipCode : int 21206 21231 21224 21211 21223 21218 21205 21211 21205 21231 ...
## $ neighborhood : Factor w/ 173 levels "Abell","Arlington",..: 53 52 18 66 104 33 98 133 98 157 ...
## $ councilDistrict: int 2 1 1 14 9 14 13 7 13 1 ...
## $ policeDistrict : Factor w/ 9 levels "CENTRAL","EASTERN",..: 3 6 6 4 8 3 6 4 6 6 ...
## $ Location.1 : Factor w/ 1210 levels "1 BIDDLE ST\nBaltimore, MD\n",..: 835 334 554 755 492 537 505 530 507 569 ...
# Default quantiles (0%, 25%, 50%, 75%, 100%), ignoring missing values.
# TRUE is spelled out because T is an ordinary, reassignable variable.
quantile(restData$councilDistrict, na.rm = TRUE)
## 0% 25% 50% 75% 100%
## 1 2 9 11 14
# Request specific probabilities instead of the defaults.
quantile(restData$councilDistrict, probs = c(0.5, 0.75, 0.9))
## 50% 75% 90%
## 9 11 12