Data Manipulation
USE of SUBSET() COMMAND - seesm like “SELECT” of SQL
# Format: subset(dataframe, condition)
# Note: movies is present in ggplot2
# To keep only the R-rated movies
movies <- read.csv("c:/AGZ1/GD_AGZ1117/AGZ_Home/workspace_R/data/movies.csv", stringsAsFactors = FALSE)
head(subset(movies, mpaa=="R") )
## X title year length budget rating votes r1 r2 r3 r4
## 7 7 $windle 2002 93 NA 5.3 200 4.5 0.0 4.5 4.5
## 43 43 'R Xmas 2001 83 NA 4.9 288 14.5 4.5 4.5 4.5
## 123 123 100 Girls 2000 90 NA 5.8 3349 4.5 4.5 4.5 4.5
## 124 124 100 Mile Rule 2002 98 1100000 5.6 181 4.5 4.5 4.5 4.5
## 153 153 11:11 2004 95 NA 4.3 222 14.5 14.5 4.5 14.5
## 169 169 13 Dead Men 2003 85 NA 2.2 69 44.5 4.5 4.5 4.5
## r5 r6 r7 r8 r9 r10 mpaa Action Animation Comedy Drama
## 7 24.5 24.5 14.5 4.5 4.5 14.5 R 1 0 0 1
## 43 14.5 24.5 14.5 4.5 4.5 4.5 R 0 0 0 1
## 123 14.5 14.5 24.5 14.5 4.5 4.5 R 0 0 1 0
## 124 14.5 24.5 14.5 14.5 4.5 14.5 R 0 0 1 0
## 153 14.5 14.5 4.5 4.5 4.5 14.5 R 0 0 0 0
## 169 4.5 4.5 4.5 4.5 0.0 14.5 R 1 0 0 0
## Documentary Romance Short
## 7 0 0 0
## 43 0 0 0
## 123 0 1 0
## 124 0 0 0
## 153 0 0 0
## 169 0 0 0
head(subset(iris, Species=="versicolor") )
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 51 7.0 3.2 4.7 1.4 versicolor
## 52 6.4 3.2 4.5 1.5 versicolor
## 53 6.9 3.1 4.9 1.5 versicolor
## 54 5.5 2.3 4.0 1.3 versicolor
## 55 6.5 2.8 4.6 1.5 versicolor
## 56 5.7 2.8 4.5 1.3 versicolor
head(subset(iris, Sepal.Length > 7) )
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 103 7.1 3.0 5.9 2.1 virginica
## 106 7.6 3.0 6.6 2.1 virginica
## 108 7.3 2.9 6.3 1.8 virginica
## 110 7.2 3.6 6.1 2.5 virginica
## 118 7.7 3.8 6.7 2.2 virginica
## 119 7.7 2.6 6.9 2.3 virginica
#Let's find the Murder Rate of the States in the highest Quartile
summary(USArrests)
## Murder Assault UrbanPop Rape
## Min. : 0.800 Min. : 45.0 Min. :32.00 Min. : 7.30
## 1st Qu.: 4.075 1st Qu.:109.0 1st Qu.:54.50 1st Qu.:15.07
## Median : 7.250 Median :159.0 Median :66.00 Median :20.10
## Mean : 7.788 Mean :170.8 Mean :65.54 Mean :21.23
## 3rd Qu.:11.250 3rd Qu.:249.0 3rd Qu.:77.75 3rd Qu.:26.18
## Max. :17.400 Max. :337.0 Max. :91.00 Max. :46.00
head(USArrests)
## Murder Assault UrbanPop Rape
## Alabama 13.2 236 58 21.2
## Alaska 10.0 263 48 44.5
## Arizona 8.1 294 80 31.0
## Arkansas 8.8 190 50 19.5
## California 9.0 276 91 40.6
## Colorado 7.9 204 78 38.7
topQuartileMurderRate <- quantile(USArrests$Murder)[4]
#Now print the Arrests for these states with the highest Arrests for Murder
subset(USArrests, Murder > topQuartileMurderRate )
## Murder Assault UrbanPop Rape
## Alabama 13.2 236 58 21.2
## Florida 15.4 335 80 31.9
## Georgia 17.4 211 60 25.8
## Louisiana 15.4 249 66 22.2
## Maryland 11.3 300 67 27.8
## Michigan 12.1 255 74 35.1
## Mississippi 16.1 259 44 17.1
## Nevada 12.2 252 81 46.0
## New Mexico 11.4 285 70 32.1
## North Carolina 13.0 337 45 16.1
## South Carolina 14.4 279 48 22.5
## Tennessee 13.2 188 59 26.9
## Texas 12.7 201 80 25.5
# USE OF Subset to get condition and only print out certain columns.
# We have to use the select argument to get the columns we want.
# If multiple columns are desired, use c(column1, column2, column3)
subset(USArrests, Murder > topQuartileMurderRate, select=UrbanPop )
## UrbanPop
## Alabama 58
## Florida 80
## Georgia 60
## Louisiana 66
## Maryland 67
## Michigan 74
## Mississippi 44
## Nevada 81
## New Mexico 70
## North Carolina 45
## South Carolina 48
## Tennessee 59
## Texas 80
# TO DROP COLUMNS FROM DATA SETS USING just the column names subset()
smallUSArrests <- subset(USArrests, select=c(-UrbanPop, -Rape)) #Minus means we don't want those columns
head(smallUSArrests)
## Murder Assault
## Alabama 13.2 236
## Alaska 10.0 263
## Arizona 8.1 294
## Arkansas 8.8 190
## California 9.0 276
## Colorado 7.9 204
Aggregating using TABLE
#Usage: table(vector)
table(iris$Species) #How many of each species are there?
##
## setosa versicolor virginica
## 50 50 50
head( subset(iris, Species == "setosa") ) # Show "setoda" only
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
mtcars$gear
## [1] 4 4 4 3 3 3 3 4 4 4 4 3 3 3 3 3 3 4 4 4 3 3 3 3 3 4 5 5 5 5 5 4
## Levels: 3 4 5
mtcars$cyl
## [1] 6 6 4 6 8 6 8 4 4 6 6 8 8 8 8 8 8 4 4 4 4 8 8 8 8 4 4 4 8 6 8 4
table(mtcars$gear)
##
## 3 4 5
## 15 12 5
table(mtcars$cyl)
##
## 4 6 8
## 11 7 14
table(mtcars$gear, mtcars$cyl) # put it together to create a summary table
##
## 4 6 8
## 3 1 2 12
## 4 8 4 0
## 5 2 1 2
names(movies)
## [1] "X" "title" "year" "length" "budget"
## [6] "rating" "votes" "r1" "r2" "r3"
## [11] "r4" "r5" "r6" "r7" "r8"
## [16] "r9" "r10" "mpaa" "Action" "Animation"
## [21] "Comedy" "Drama" "Documentary" "Romance" "Short"
library(ggplot2)
# Table Using "with" -- notice that we don't need to use $
with(movies, table(year))
## year
## 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907
## 1 9 3 13 9 5 9 16 28 9 37 42 17 17 12
## 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922
## 24 30 26 22 34 32 54 54 49 37 41 52 43 53 52
## 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937
## 48 50 79 94 83 109 184 288 346 412 421 482 464 484 484
## 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952
## 463 484 503 521 507 446 426 375 432 439 478 486 491 518 513
## 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967
## 539 501 522 497 556 528 495 478 466 513 503 517 530 579 594
## 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982
## 651 625 586 646 637 634 625 619 665 617 609 632 681 661 689
## 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997
## 698 749 792 792 957 944 944 899 888 948 1016 1199 1248 1390 1568
## 1998 1999 2000 2001 2002 2003 2004 2005
## 1705 1927 2048 2121 2168 2158 1945 349
with(movies, table(length))
## length
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 169 116 243 185 279 726 1379 700 432 576 347 311 224 229 349
## 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
## 240 258 252 176 376 141 147 126 97 161 111 100 123 107 225
## 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45
## 32 45 36 40 68 21 29 38 22 117 30 29 22 22 74
## 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 21 43 50 12 129 27 88 73 102 148 147 136 187 123 409
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
## 188 193 181 172 279 182 240 246 232 523 288 398 331 295 653
## 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## 364 397 514 421 1149 567 803 768 826 1497 941 1152 1321 1134 3506
## 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105
## 1304 1502 1395 1271 1900 1212 1199 1122 869 1711 700 853 752 719 1193
## 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
## 553 561 558 431 910 348 422 326 277 457 240 258 300 211 496
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
## 168 169 164 174 190 125 113 121 85 151 68 89 68 79 122
## 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150
## 56 53 67 50 109 49 47 51 35 59 40 31 25 24 65
## 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165
## 20 31 36 22 34 21 26 30 21 43 17 24 19 16 38
## 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## 21 40 23 14 42 16 19 15 11 26 14 15 13 12 34
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195
## 7 7 7 6 9 11 10 11 7 8 3 9 6 5 8
## 196 197 198 199 200 201 202 204 205 206 207 208 209 210 211
## 6 4 2 2 14 4 4 4 2 5 2 5 2 7 3
## 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226
## 7 3 2 5 5 2 4 5 7 1 1 3 3 6 3
## 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241
## 3 2 3 1 5 2 2 1 3 2 2 1 2 12 1
## 242 243 244 245 246 247 248 250 251 252 254 255 256 259 260
## 4 2 3 6 2 2 2 2 3 3 2 1 1 2 3
## 261 263 264 265 267 269 270 275 277 278 279 283 285 288 293
## 1 4 1 4 1 2 2 2 1 3 1 1 5 2 1
## 294 298 299 300 301 302 306 311 312 315 316 320 321 328 358
## 1 1 1 4 1 1 1 1 1 1 2 2 1 1 3
## 360 384 390 399 402 407 410 417 418 480 485 501 555 566 647
## 1 1 2 1 1 1 1 1 1 2 1 1 1 1 1
## 773 873 1100 2880 5220
## 1 1 1 1 1
with(movies, table(length>200))
##
## FALSE TRUE
## 58538 250
X T A B S # C ROSS TABULATION
AGGREGATE
# Aggregate WITHOUT USING THE FORMULA SYNTAX
head(airquality)
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA 0 14.3 56 5 5
## 6 28 0 14.9 66 5 6
aggregate(airquality, by=list(airquality$Month), FUN=mean)
## Group.1 Ozone Solar.R Wind Temp Month Day
## 1 5 NA 157.9032 11.622581 65.54839 5 16.0
## 2 6 NA 190.1667 10.266667 79.10000 6 15.5
## 3 7 NA 216.4839 8.941935 83.90323 7 16.0
## 4 8 NA 155.2258 8.793548 83.96774 8 16.0
## 5 9 NA 167.4333 10.180000 76.90000 9 15.5
aggregate(airquality, by=list(airquality$Month), FUN=mean, na.rm=TRUE)
## Group.1 Ozone Solar.R Wind Temp Month Day
## 1 5 23.61538 157.9032 11.622581 65.54839 5 16.0
## 2 6 29.44444 190.1667 10.266667 79.10000 6 15.5
## 3 7 59.11538 216.4839 8.941935 83.90323 7 16.0
## 4 8 59.96154 155.2258 8.793548 83.96774 8 16.0
## 5 9 31.44828 167.4333 10.180000 76.90000 9 15.5
head(aggregate(airquality, by=list(airquality$Month, airquality$Day), FUN=mean, na.rm=TRUE) )
## Group.1 Group.2 Ozone Solar.R Wind Temp Month Day
## 1 5 1 41 190 7.4 67 5 1
## 2 6 1 NaN 286 8.6 78 6 1
## 3 7 1 135 269 4.1 84 7 1
## 4 8 1 39 83 6.9 81 8 1
## 5 9 1 96 167 6.9 91 9 1
## 6 5 2 36 118 8.0 72 5 2
#BONUS Example
aggregate(state.x77, list(Region = state.region), mean)
## Region Population Income Illiteracy Life Exp Murder HS Grad
## 1 Northeast 5495.111 4570.222 1.000000 71.26444 4.722222 53.96667
## 2 South 4208.125 4011.938 1.737500 69.70625 10.581250 44.34375
## 3 North Central 4803.000 4611.083 0.700000 71.76667 5.275000 54.51667
## 4 West 2915.308 4702.615 1.023077 71.23462 7.215385 62.00000
## Frost Area
## 1 132.7778 18141.00
## 2 64.6250 54605.12
## 3 138.8333 62652.00
## 4 102.1538 134463.00
#-----
# USING FORUMLA SYNTAX
#------
head(airquality)
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA 0 14.3 56 5 5
## 6 28 0 14.9 66 5 6
aggregate(data=airquality, Ozone~Month, mean)
## Month Ozone
## 1 5 23.61538
## 2 6 29.44444
## 3 7 59.11538
## 4 8 59.96154
## 5 9 31.44828
head(aggregate(data=airquality, Ozone~Month+Day, mean) )
## Month Day Ozone
## 1 5 1 41
## 2 7 1 135
## 3 8 1 39
## 4 9 1 96
## 5 5 2 36
## 6 7 2 49
#use cbind if you want to aggregate more than one variable
head(aggregate(data=airquality, cbind(Ozone,Wind)~Month+Day, mean) )
## Month Day Ozone Wind
## 1 5 1 41 7.4
## 2 7 1 135 4.1
## 3 8 1 39 6.9
## 4 9 1 96 6.9
## 5 5 2 36 8.0
## 6 7 2 49 9.2
# Bonus Example: IRIS DATA SET
aggregate(. ~Species, data=iris, FUN='length')
## Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 setosa 50 50 50 50
## 2 versicolor 50 50 50 50
## 3 virginica 50 50 50 50
aggregate(Sepal.Length ~ Species, data=iris, FUN='length')
## Species Sepal.Length
## 1 setosa 50
## 2 versicolor 50
## 3 virginica 50
head(aggregate(Species ~ Sepal.Length, data=iris, FUN='length') ) #this is NOT what we want
## Sepal.Length Species
## 1 4.3 1
## 2 4.4 3
## 3 4.5 1
## 4 4.6 4
## 5 4.7 2
## 6 4.8 5
aggregate(Sepal.Length ~ Species, data=iris, FUN='mean')
## Species Sepal.Length
## 1 setosa 5.006
## 2 versicolor 5.936
## 3 virginica 6.588
C U T
head(airquality)
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA 0 14.3 56 5 5
## 6 28 0 14.9 66 5 6
range(airquality$Temp)
## [1] 56 97
# [1] 56 97
# First let's cut this vector into 5 groups:
head(cut(airquality$Temp, 5) )
## [1] (64.2,72.4] (64.2,72.4] (72.4,80.6] (56,64.2] (56,64.2] (64.2,72.4]
## Levels: (56,64.2] (64.2,72.4] (72.4,80.6] (80.6,88.8] (88.8,97]
#---- Notice that R cut the range into 5 exactly equal intervals.
#If we use the labels=FALSE argument, we get an easier to read output
cut(airquality$Temp, 5, labels=FALSE)
## [1] 2 2 3 1 1 2 2 1 1 2 3 2 2 2 1 1 2 1 2 1 1 3 1 1 1 1 1 2 4 3 3 3 3 2 4
## [36] 4 3 4 4 5 4 5 5 4 3 3 3 2 2 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 5 5
## [71] 5 4 3 4 5 3 4 4 4 4 4 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 5 5 5 5 4 4 4
## [106] 3 3 3 3 3 3 3 3 2 3 3 4 4 4 5 5 5 5 5 5 5 5 4 4 3 3 3 3 4 3 3 2 2 3 2
## [141] 3 2 4 1 2 4 2 1 2 3 3 3 2
#How many data points fall in each of the 5 intervals?
table(cut(airquality$Temp, 5))
##
## (56,64.2] (64.2,72.4] (72.4,80.6] (80.6,88.8] (88.8,97]
## 16 23 46 49 19
#-----Creating "NICE" Groups (Friendlier for human reading)
TempBreaks=seq(50,100, by=10)
TempBuckets <- cut(airquality$Temp, breaks=TempBreaks)
summary(TempBuckets)
## (50,60] (60,70] (70,80] (80,90] (90,100]
## 8 25 52 54 14
#Let's create a NEW COLUMN in the data frame to store the bucket number
airquality$TempBuckets <- cut(airquality$Temp, TempBreaks, labels=FALSE)
#Let's cut the airquality Temp into 3 groups: High Medium and Low
high_medium_low <- cut(airquality$Temp, breaks=c(0, 70, 80, 100), labels=c("Low","Medium","High"))
summary(high_medium_low)
## Low Medium High
## 33 52 68
A P P L Y Family of Functions
v <- c(40,2,83,28,58)
f <- factor(c("A","C","C","B","C"))
v
## [1] 40 2 83 28 58
f
## [1] A C C B C
## Levels: A B C
data(diamonds)
groups <- split(x, f)
## Warning in split.default(x, f): data length is not a multiple of split
## variable
groups
## $A
## [1] "8 23 37 38 20 40 39 41 21 33"
##
## $B
## character(0)
##
## $C
## character(0)
groups <- unstack(data.frame(x,f))
library(MASS)
split(Cars93$MPG.city, Cars93$Origin)$USA
## [1] 22 19 16 19 16 16 25 25 19 21 18 15 17 17 20 23 20 29 23 22 17 21 18
## [24] 29 20 31 23 22 22 24 15 21 18 17 18 23 19 24 23 18 19 23 31 23 19 19
## [47] 19 28
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
apply(mtcars, 1, max) # take each row of mtcars, and find its max value.
## Mazda RX4 Mazda RX4 Wag Datsun 710
## "6" "6" "4"
## Hornet 4 Drive Hornet Sportabout Valiant
## "6" "8" "6"
## Duster 360 Merc 240D Merc 230
## "8" "4" "4"
## Merc 280 Merc 280C Merc 450SE
## "6" "6" "8"
## Merc 450SL Merc 450SLC Cadillac Fleetwood
## "8" "8" "8"
## Lincoln Continental Chrysler Imperial Fiat 128
## "8" "8" "4.08"
## Honda Civic Toyota Corolla Toyota Corona
## "4.93" "4.22" "4"
## Dodge Challenger AMC Javelin Camaro Z28
## "8" "8" "8"
## Pontiac Firebird Fiat X1-9 Porsche 914-2
## "8" "4.08" "5"
## Lotus Europa Ford Pantera L Ferrari Dino
## "5" "8" "6"
## Maserati Bora Volvo 142E
## "8" "4.11"
apply(iris[,1:4], 1, mean) #caution: This is just for illustration.
## [1] 2.550 2.375 2.350 2.350 2.550 2.850 2.425 2.525 2.225 2.400 2.700
## [12] 2.500 2.325 2.125 2.800 3.000 2.750 2.575 2.875 2.675 2.675 2.675
## [23] 2.350 2.650 2.575 2.450 2.600 2.600 2.550 2.425 2.425 2.675 2.725
## [34] 2.825 2.425 2.400 2.625 2.500 2.225 2.550 2.525 2.100 2.275 2.675
## [45] 2.800 2.375 2.675 2.350 2.675 2.475 4.075 3.900 4.100 3.275 3.850
## [56] 3.575 3.975 2.900 3.850 3.300 2.875 3.650 3.300 3.775 3.350 3.900
## [67] 3.650 3.400 3.600 3.275 3.925 3.550 3.800 3.700 3.725 3.850 3.950
## [78] 4.100 3.725 3.200 3.200 3.150 3.400 3.850 3.600 3.875 4.000 3.575
## [89] 3.500 3.325 3.425 3.775 3.400 2.900 3.450 3.525 3.525 3.675 2.925
## [100] 3.475 4.525 3.875 4.525 4.150 4.375 4.825 3.400 4.575 4.200 4.850
## [111] 4.200 4.075 4.350 3.800 4.025 4.300 4.200 5.100 4.875 3.675 4.525
## [122] 3.825 4.800 3.925 4.450 4.550 3.900 3.950 4.225 4.400 4.550 5.025
## [133] 4.250 3.925 3.925 4.775 4.425 4.200 3.900 4.375 4.450 4.350 3.875
## [144] 4.550 4.550 4.300 3.925 4.175 4.325 3.950
#taking the mean of a bunch of different columns usually doesn't make mathematical sense.
#applying functions to columns
apply(mtcars, 2, summary)
## mpg cyl disp hp drat
## Length "32" "32" "32" "32" "32"
## Class "character" "character" "character" "character" "character"
## Mode "character" "character" "character" "character" "character"
## wt qsec vs am gear
## Length "32" "32" "32" "32" "32"
## Class "character" "character" "character" "character" "character"
## Mode "character" "character" "character" "character" "character"
## carb
## Length "32"
## Class "character"
## Mode "character"
# L A P P L Y
lst <- list(1,"abc", 1.3, TRUE)
listClasses <- lapply(lst, class)
listClasses
## [[1]]
## [1] "numeric"
##
## [[2]]
## [1] "character"
##
## [[3]]
## [1] "numeric"
##
## [[4]]
## [1] "logical"
#--- SAPPLY might be even better
#--- understand the difference between the two commands:
lClasses <- lapply(lst, class)
sClasses <- sapply(lst,class)
str(lClasses)
## List of 4
## $ : chr "numeric"
## $ : chr "character"
## $ : chr "numeric"
## $ : chr "logical"
str(sClasses)
## chr [1:4] "numeric" "character" "numeric" "logical"
lClasses <- lapply(mtcars, mean)
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
sClasses <- sapply(mtcars,mean)
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
str(lClasses)
## List of 11
## $ mpg : num 20.1
## $ cyl : num 6.19
## $ disp: num 231
## $ hp : num 147
## $ drat: num 3.6
## $ wt : num 3.22
## $ qsec: num 17.8
## $ vs : num 0.438
## $ am : num 0.406
## $ gear: num NA
## $ carb: num 2.81
str(sClasses)
## Named num [1:11] 20.09 6.19 230.72 146.69 3.6 ...
## - attr(*, "names")= chr [1:11] "mpg" "cyl" "disp" "hp" ...
head(lapply(airquality$Ozone, round) )
## [[1]]
## [1] 41
##
## [[2]]
## [1] 36
##
## [[3]]
## [1] 12
##
## [[4]]
## [1] 18
##
## [[5]]
## [1] NA
##
## [[6]]
## [1] 28
sapply(airquality$Ozone, round)
## [1] 41 36 12 18 NA 28 23 19 8 NA 7 16 11 14 18 14 34
## [18] 6 30 11 1 11 4 32 NA NA NA 23 45 115 37 NA NA NA
## [35] NA NA NA 29 NA 71 39 NA NA 23 NA NA 21 37 20 12 13
## [52] NA NA NA NA NA NA NA NA NA NA 135 49 32 NA 64 40 77
## [69] 97 97 85 NA 10 27 NA 7 48 35 61 79 63 16 NA NA 80
## [86] 108 20 52 82 50 64 59 39 9 16 78 35 66 122 89 110 NA
## [103] NA 44 28 65 NA 22 59 23 31 44 21 9 NA 45 168 73 NA
## [120] 76 118 84 85 96 78 73 91 47 32 20 23 21 24 44 21 28
## [137] 9 13 46 18 13 24 16 13 23 36 7 14 30 NA 14 18 20
sapply(airquality, max)
## Ozone Solar.R Wind Temp Month Day
## NA 334.0 20.7 97.0 9.0 31.0
## TempBuckets
## 5.0
head(airquality)
## Ozone Solar.R Wind Temp Month Day TempBuckets
## 1 41 190 7.4 67 5 1 2
## 2 36 118 8.0 72 5 2 3
## 3 12 149 12.6 74 5 3 3
## 4 18 313 11.5 62 5 4 2
## 5 NA 0 14.3 56 5 5 1
## 6 28 0 14.9 66 5 6 2
T A P P L Y
#hts of 10 people
heights <- c(177, 153, 133, 121, 164, 161, 127, 122, 180, 161, 131, 128)
groupIndex <- c("Male", "Woman", "Child", "Child","Male", "Woman", "Child", "Child","Male", "Woman", "Child", "Child")
#IMPORTANT: heights and index should be of the same length.
# Each element of the vector should have an group identified in the Index vector.
tapply(heights, groupIndex, mean)
## Child Male Woman
## 127.0000 173.6667 158.3333
# Can not find the data now:
# gdp <- read.csv("~/data/countries_wide.csv") #replace this with the right path for your file
# gdp <- read.csv("data/countryCodes.csv") #replace this with the right path for your file
# str(gdp)
# tapply(gdp$POP, gdp$country.isocode, mean)
# tapply(gdp$POP, list(gdp$country.isocode,gdp$year), mean)
############
# D D P L Y
##########
library(plyr)
#Let's use the in-built Titanic dataset to understand ddply
#what does ddply do?
# DDPLY: For each subset of a data frame, apply function then combine results into a data frame
str(Titanic)
## table [1:4, 1:2, 1:2, 1:2] 0 0 35 0 0 0 17 0 118 154 ...
## - attr(*, "dimnames")=List of 4
## ..$ Class : chr [1:4] "1st" "2nd" "3rd" "Crew"
## ..$ Sex : chr [1:2] "Male" "Female"
## ..$ Age : chr [1:2] "Child" "Adult"
## ..$ Survived: chr [1:2] "No" "Yes"
titanic <- as.data.frame(Titanic)
head(titanic)
## Class Sex Age Survived Freq
## 1 1st Male Child No 0
## 2 2nd Male Child No 0
## 3 3rd Male Child No 35
## 4 Crew Male Child No 0
## 5 1st Female Child No 0
## 6 2nd Female Child No 0
ddply(titanic, .(Class), summarize, Total=sum(Freq))
## Class Total
## 1 1st 325
## 2 2nd 285
## 3 3rd 706
## 4 Crew 885
ddply(titanic, .(Class, Survived), summarize, Total=sum(Freq))
## Class Survived Total
## 1 1st No 122
## 2 1st Yes 203
## 3 2nd No 167
## 4 2nd Yes 118
## 5 3rd No 528
## 6 3rd Yes 178
## 7 Crew No 673
## 8 Crew Yes 212
ddply(titanic, .(Sex, Survived), summarize, Total=sum(Freq))
## Sex Survived Total
## 1 Male No 1364
## 2 Male Yes 367
## 3 Female No 126
## 4 Female Yes 344
ddply(titanic, .(Class, Sex, Survived), summarize, Total=sum(Freq))
## Class Sex Survived Total
## 1 1st Male No 118
## 2 1st Male Yes 62
## 3 1st Female No 4
## 4 1st Female Yes 141
## 5 2nd Male No 154
## 6 2nd Male Yes 25
## 7 2nd Female No 13
## 8 2nd Female Yes 93
## 9 3rd Male No 422
## 10 3rd Male Yes 88
## 11 3rd Female No 106
## 12 3rd Female Yes 90
## 13 Crew Male No 670
## 14 Crew Male Yes 192
## 15 Crew Female No 3
## 16 Crew Female Yes 20
str(titanic)
## 'data.frame': 32 obs. of 5 variables:
## $ Class : Factor w/ 4 levels "1st","2nd","3rd",..: 1 2 3 4 1 2 3 4 1 2 ...
## $ Sex : Factor w/ 2 levels "Male","Female": 1 1 1 1 2 2 2 2 1 1 ...
## $ Age : Factor w/ 2 levels "Child","Adult": 1 1 1 1 1 1 1 1 2 2 ...
## $ Survived: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ Freq : num 0 0 35 0 0 0 17 0 118 154 ...
M E R G E
We want to fill in all missing hrs from 1 to 10, and fill in the value 0 for those.
x <- data.frame(hrs=c(3,4,7), values=1:3)
x
## hrs values
## 1 3 1
## 2 4 2
## 3 7 3
allhours <- data.frame(hours=1:10)
allhours
## hours
## 1 1
## 2 2
## 3 3
## 4 4
## 5 5
## 6 6
## 7 7
## 8 8
## 9 9
## 10 10
merge(x, allhours, all=T, by.y="hours", by.x="hrs")
## hrs values
## 1 1 NA
## 2 2 NA
## 3 3 1
## 4 4 2
## 5 5 NA
## 6 6 NA
## 7 7 3
## 8 8 NA
## 9 9 NA
## 10 10 NA
# These have error, I can't make them work now:
# df <- merge(x, allhours, all.y=TRUE, by=c("hrs"))
# df # convert the NA to 0
# df$values[is.na(df$values)] <- 0
# df
TIDY DATA: RESHAPING (ref. p8 of ITDA_Lecture+4a)
Wide Data: good for human to read; Long data: good for R
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
dim(iris)
## [1] 150 5
names(iris)
## [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## [5] "Species"
library(reshape2)
m.iris <- melt(iris)
## Using Species as id variables
names(m.iris)
## [1] "Species" "variable" "value"
dim(m.iris)
## [1] 600 3
head(m.iris)
## Species variable value
## 1 setosa Sepal.Length 5.1
## 2 setosa Sepal.Length 4.9
## 3 setosa Sepal.Length 4.7
## 4 setosa Sepal.Length 4.6
## 5 setosa Sepal.Length 5.0
## 6 setosa Sepal.Length 5.4
dcast(m.iris, Species~variable)
## Aggregation function missing: defaulting to length
## Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 setosa 50 50 50 50
## 2 versicolor 50 50 50 50
## 3 virginica 50 50 50 50
m.iris <- melt(data=iris, id.var="Species")
names(m.iris)
## [1] "Species" "variable" "value"
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
#m.iris
head(m.iris)
## Species variable value
## 1 setosa Sepal.Length 5.1
## 2 setosa Sepal.Length 4.9
## 3 setosa Sepal.Length 4.7
## 4 setosa Sepal.Length 4.6
## 5 setosa Sepal.Length 5.0
## 6 setosa Sepal.Length 5.4
#Once you melt it this way, no way to "remember" which rows came from where.
#need a row ID
iris$id <- row.names(iris)
irislong <- melt(iris, c("Species", "id"))
head(dcast(irislong, Species+id~variable) )
## Species id Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 setosa 1 5.1 3.5 1.4 0.2
## 2 setosa 10 4.9 3.1 1.5 0.1
## 3 setosa 11 5.4 3.7 1.5 0.2
## 4 setosa 12 4.8 3.4 1.6 0.2
## 5 setosa 13 4.8 3.0 1.4 0.1
## 6 setosa 14 4.3 3.0 1.1 0.1
# gdp
# melt(gdp)
# m.gdp <- melt(gdp, id.vars=c("country","country.isocode", "year")) # Has error now
# names(m.gdp)
# dcast(m.gdp, country.isocode+year ~ variable)
## DATE AND TIME MANIPULATION
as.Date('2013-10-30')
## [1] "2013-10-30"
dte <- c("02/27/13")
as.Date(dte, "%m/%d/%y")
## [1] "2013-02-27"
# You can use ISOdate to combine elements and then convert to a date type
# ISOdate(year, month, day) and then as.Date()
as.Date(ISOdate(2013,1,1))
## [1] "2013-01-01"
#The format() argument inside date
#format(Sys.Date(), format=âÂÂ%m/%d/%YâÂÂ
# Extracting Parts of a Date
d <- as.Date("2013-10-15")
p <- as.POSIXlt(d)
# Get parts of p
p$mon; p$year+1900
## [1] 9
## [1] 2013
s <- as.Date("2013-10-01")
e <- as.Date("2013-11-01")
#Creating a Sequence of Dates
seq(from=s, to=e, by=1)
## [1] "2013-10-01" "2013-10-02" "2013-10-03" "2013-10-04" "2013-10-05"
## [6] "2013-10-06" "2013-10-07" "2013-10-08" "2013-10-09" "2013-10-10"
## [11] "2013-10-11" "2013-10-12" "2013-10-13" "2013-10-14" "2013-10-15"
## [16] "2013-10-16" "2013-10-17" "2013-10-18" "2013-10-19" "2013-10-20"
## [21] "2013-10-21" "2013-10-22" "2013-10-23" "2013-10-24" "2013-10-25"
## [26] "2013-10-26" "2013-10-27" "2013-10-28" "2013-10-29" "2013-10-30"
## [31] "2013-10-31" "2013-11-01"
seq(from=s, by='year', length.out=3)
## [1] "2013-10-01" "2014-10-01" "2015-10-01"
DATA MANIPULATION WITH DPLYR
# SIMPLE INTRODUCTION TO PIPES
round(sqrt(1000), 3) #usual R way of doing things
## [1] 31.623
library(magrittr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
1000 %>% sqrt %>% round()
## [1] 32
1000 %>% sqrt %>% round(.,3)
## [1] 31.623
####
# DPLYR BASICS
#####
#TBL_DF & GLIMPSE()
#pretty print a data frame
library(ggplot2)
pretty_movies <- tbl_df(movies)
head(movies)
## X title year length budget rating votes r1 r2 r3
## 1 1 $ 1971 121 NA 6.4 348 4.5 4.5 4.5
## 2 2 $1000 a Touchdown 1939 71 NA 6.0 20 0.0 14.5 4.5
## 3 3 $21 a Day Once a Month 1941 7 NA 8.2 5 0.0 0.0 0.0
## 4 4 $40,000 1996 70 NA 8.2 6 14.5 0.0 0.0
## 5 5 $50,000 Climax Show, The 1975 71 NA 3.4 17 24.5 4.5 0.0
## 6 6 $pent 2000 91 NA 4.3 45 4.5 4.5 4.5
## r4 r5 r6 r7 r8 r9 r10 mpaa Action Animation Comedy Drama
## 1 4.5 14.5 24.5 24.5 14.5 4.5 4.5 0 0 1 1
## 2 24.5 14.5 14.5 14.5 4.5 4.5 14.5 0 0 1 0
## 3 0.0 0.0 24.5 0.0 44.5 24.5 24.5 0 1 0 0
## 4 0.0 0.0 0.0 0.0 0.0 34.5 45.5 0 0 1 0
## 5 14.5 14.5 4.5 0.0 0.0 0.0 24.5 0 0 0 0
## 6 14.5 14.5 14.5 4.5 4.5 14.5 14.5 0 0 0 1
## Documentary Romance Short
## 1 0 0 0
## 2 0 0 0
## 3 0 0 1
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
pretty_movies #notice that is prints nicely
## # A tibble: 58,788 × 25
## X title year length budget rating votes r1
## <int> <chr> <int> <int> <int> <dbl> <int> <dbl>
## 1 1 $ 1971 121 NA 6.4 348 4.5
## 2 2 $1000 a Touchdown 1939 71 NA 6.0 20 0.0
## 3 3 $21 a Day Once a Month 1941 7 NA 8.2 5 0.0
## 4 4 $40,000 1996 70 NA 8.2 6 14.5
## 5 5 $50,000 Climax Show, The 1975 71 NA 3.4 17 24.5
## 6 6 $pent 2000 91 NA 4.3 45 4.5
## 7 7 $windle 2002 93 NA 5.3 200 4.5
## 8 8 '15' 2002 25 NA 6.7 24 4.5
## 9 9 '38 1987 97 NA 6.6 18 4.5
## 10 10 '49-'17 1917 61 NA 6.0 51 4.5
## # ... with 58,778 more rows, and 17 more variables: r2 <dbl>, r3 <dbl>,
## # r4 <dbl>, r5 <dbl>, r6 <dbl>, r7 <dbl>, r8 <dbl>, r9 <dbl>, r10 <dbl>,
## # mpaa <chr>, Action <int>, Animation <int>, Comedy <int>, Drama <int>,
## # Documentary <int>, Romance <int>, Short <int>
glimpse(movies) #easier to read than str()
## Observations: 58,788
## Variables: 25
## $ X <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ title <chr> "$", "$1000 a Touchdown", "$21 a Day Once a Month"...
## $ year <int> 1971, 1939, 1941, 1996, 1975, 2000, 2002, 2002, 19...
## $ length <int> 121, 71, 7, 70, 71, 91, 93, 25, 97, 61, 99, 96, 10...
## $ budget <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ rating <dbl> 6.4, 6.0, 8.2, 8.2, 3.4, 4.3, 5.3, 6.7, 6.6, 6.0, ...
## $ votes <int> 348, 20, 5, 6, 17, 45, 200, 24, 18, 51, 23, 53, 44...
## $ r1 <dbl> 4.5, 0.0, 0.0, 14.5, 24.5, 4.5, 4.5, 4.5, 4.5, 4.5...
## $ r2 <dbl> 4.5, 14.5, 0.0, 0.0, 4.5, 4.5, 0.0, 4.5, 4.5, 0.0,...
## $ r3 <dbl> 4.5, 4.5, 0.0, 0.0, 0.0, 4.5, 4.5, 4.5, 4.5, 4.5, ...
## $ r4 <dbl> 4.5, 24.5, 0.0, 0.0, 14.5, 14.5, 4.5, 4.5, 0.0, 4....
## $ r5 <dbl> 14.5, 14.5, 0.0, 0.0, 14.5, 14.5, 24.5, 4.5, 0.0, ...
## $ r6 <dbl> 24.5, 14.5, 24.5, 0.0, 4.5, 14.5, 24.5, 14.5, 0.0,...
## $ r7 <dbl> 24.5, 14.5, 0.0, 0.0, 0.0, 4.5, 14.5, 14.5, 34.5, ...
## $ r8 <dbl> 14.5, 4.5, 44.5, 0.0, 0.0, 4.5, 4.5, 14.5, 14.5, 4...
## $ r9 <dbl> 4.5, 4.5, 24.5, 34.5, 0.0, 14.5, 4.5, 4.5, 4.5, 4....
## $ r10 <dbl> 4.5, 14.5, 24.5, 45.5, 24.5, 14.5, 14.5, 14.5, 24....
## $ mpaa <chr> "", "", "", "", "", "", "R", "", "", "", "", "", "...
## $ Action <int> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,...
## $ Animation <int> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Comedy <int> 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,...
## $ Drama <int> 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,...
## $ Documentary <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Romance <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Short <int> 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,...
FILTER
movies_with_budgets <- filter(movies, !is.na(budget))
head(filter(movies, Documentary==1) )
## X title year length budget
## 1 8 '15' 2002 25 NA
## 2 18 'Bullitt': Steve McQueen's Commitment to Reality 1968 10 NA
## 3 50 'What's Your 'I.Q.'?' Number Two 1940 9 NA
## 4 101 1 Giant Leap 2002 155 NA
## 5 104 1,99 - Um Supermercado Que Vende Palavras 2003 72 NA
## 6 107 1/2 Mensch 1986 48 NA
## rating votes r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 mpaa Action
## 1 6.7 24 4.5 4.5 4.5 4.5 4.5 14.5 14.5 14.5 4.5 14.5 0
## 2 6.6 37 0.0 0.0 4.5 4.5 4.5 14.5 44.5 14.5 4.5 4.5 0
## 3 5.9 9 0.0 0.0 14.5 14.5 24.5 24.5 14.5 0.0 14.5 14.5 0
## 4 6.6 75 4.5 4.5 0.0 4.5 4.5 14.5 24.5 14.5 14.5 34.5 0
## 5 4.9 28 4.5 4.5 4.5 4.5 14.5 4.5 14.5 24.5 4.5 14.5 0
## 6 8.1 29 4.5 0.0 0.0 0.0 4.5 0.0 0.0 4.5 24.5 64.5 0
## Animation Comedy Drama Documentary Romance Short
## 1 0 0 0 1 0 1
## 2 0 0 0 1 0 1
## 3 0 0 0 1 0 1
## 4 0 0 0 1 0 0
## 5 0 1 0 1 0 0
## 6 0 0 0 1 0 0
filter(movies, Documentary==1) %>% nrow()
## [1] 3472
good_comedies <- filter(movies, rating > 9, Comedy==1)
dim(good_comedies) #171 movies
## [1] 171 25
# Let us say we only want highly rated comdies,
# which a lot of people have watched,
# made after year 2000.
head(movies %>% filter(rating >8, Comedy==1, votes > 100, year > 2000) )
## X title year length budget rating
## 1 5535 Big Fish 2003 125 70000000 8.1
## 2 7104 Bowling for Columbine 2002 120 4000000 8.5
## 3 10096 Cielo abierto, El 2001 110 NA 8.2
## 4 15356 Dzien swira 2002 93 NA 8.3
## 5 15612 Eierdiebe 2003 87 NA 8.1
## 6 16424 Eternal Sunshine of the Spotless Mind 2004 108 20000000 8.6
## votes r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 mpaa Action Animation
## 1 31525 4.5 4.5 4.5 4.5 4.5 4.5 14.5 24.5 24.5 24.5 PG-13 0 0
## 2 36747 4.5 4.5 4.5 4.5 4.5 4.5 4.5 14.5 24.5 34.5 R 0 0
## 3 191 4.5 4.5 4.5 4.5 4.5 4.5 14.5 24.5 24.5 24.5 0 0
## 4 349 4.5 4.5 4.5 4.5 4.5 4.5 4.5 14.5 24.5 44.5 0 0
## 5 130 4.5 4.5 0.0 4.5 4.5 4.5 14.5 14.5 14.5 34.5 0 0
## 6 46240 4.5 4.5 4.5 4.5 4.5 4.5 4.5 14.5 24.5 44.5 R 0 0
## Comedy Drama Documentary Romance Short
## 1 1 1 0 0 0
## 2 1 1 1 0 0
## 3 1 0 0 1 0
## 4 1 1 0 0 0
## 5 1 1 0 1 0
## 6 1 1 0 1 0
head(glimpse(good_comedies) )
## Observations: 171
## Variables: 25
## $ X <int> 59, 718, 2086, 2717, 2822, 3173, 3491, 3935, 3995,...
## $ title <chr> "+1 -1", "Abduction of Figaro", "America 101", "An...
## $ year <int> 1987, 1984, 2005, 1919, 1941, 2004, 1962, 1962, 20...
## $ length <int> 7, 144, 86, 60, 80, 109, 83, 61, 9, 85, 91, 90, 15...
## $ budget <int> NA, NA, NA, NA, NA, 3200000, NA, NA, NA, NA, NA, 5...
## $ rating <dbl> 9.4, 9.1, 9.5, 9.1, 9.4, 9.2, 9.1, 9.1, 9.3, 9.2, ...
## $ votes <int> 6, 22, 31, 19, 5, 23, 10, 8, 6, 8, 19, 11, 13, 6, ...
## $ r1 <dbl> 0.0, 0.0, 4.5, 14.5, 0.0, 4.5, 0.0, 0.0, 0.0, 0.0,...
## $ r2 <dbl> 0.0, 0.0, 4.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
## $ r3 <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 14.5, 0.0, 0.0,...
## $ r4 <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
## $ r5 <dbl> 0.0, 0.0, 0.0, 4.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
## $ r6 <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
## $ r7 <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 4.5, 24.5, 0.0, 14.5, 14....
## $ r8 <dbl> 0.0, 14.5, 4.5, 0.0, 24.5, 4.5, 0.0, 0.0, 0.0, 0.0...
## $ r9 <dbl> 45.5, 4.5, 4.5, 14.5, 24.5, 24.5, 14.5, 0.0, 14.5,...
## $ r10 <dbl> 45.5, 84.5, 74.5, 64.5, 64.5, 64.5, 74.5, 84.5, 64...
## $ mpaa <chr> "", "", "", "", "", "", "", "", "", "", "PG-13", "...
## $ Action <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Animation <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Comedy <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ Drama <int> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,...
## $ Documentary <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Romance <int> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ Short <int> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1,...
## X title year length budget rating votes r1 r2
## 1 59 +1 -1 1987 7 NA 9.4 6 0.0 0.0
## 2 718 Abduction of Figaro 1984 144 NA 9.1 22 0.0 0.0
## 3 2086 America 101 2005 86 NA 9.5 31 4.5 4.5
## 4 2717 Anne of Green Gables 1919 60 NA 9.1 19 14.5 0.0
## 5 2822 Anton Ivanovich serditsya 1941 80 NA 9.4 5 0.0 0.0
## 6 3173 Arrangement 2004 109 3200000 9.2 23 4.5 0.0
## r3 r4 r5 r6 r7 r8 r9 r10 mpaa Action Animation Comedy Drama
## 1 0 0 0.0 0 0.0 0.0 45.5 45.5 0 1 1 0
## 2 0 0 0.0 0 0.0 14.5 4.5 84.5 0 0 1 0
## 3 0 0 0.0 0 0.0 4.5 4.5 74.5 0 0 1 0
## 4 0 0 4.5 0 0.0 0.0 14.5 64.5 0 0 1 1
## 5 0 0 0.0 0 0.0 24.5 24.5 64.5 0 0 1 0
## 6 0 0 0.0 0 4.5 4.5 24.5 64.5 0 0 1 0
## Documentary Romance Short
## 1 0 0 1
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 1 0
head(good_comedies$title)
## [1] "+1 -1" "Abduction of Figaro"
## [3] "America 101" "Anne of Green Gables"
## [5] "Anton Ivanovich serditsya" "Arrangement"
S E L E C T
movies_df <- tbl_df(movies)
select(movies_df, title, year, rating) #Just the columns we want to see
## # A tibble: 58,788 × 3
## title year rating
## <chr> <int> <dbl>
## 1 $ 1971 6.4
## 2 $1000 a Touchdown 1939 6.0
## 3 $21 a Day Once a Month 1941 8.2
## 4 $40,000 1996 8.2
## 5 $50,000 Climax Show, The 1975 3.4
## 6 $pent 2000 4.3
## 7 $windle 2002 5.3
## 8 '15' 2002 6.7
## 9 '38 1987 6.6
## 10 '49-'17 1917 6.0
## # ... with 58,778 more rows
select(movies_df, -c(r1:r10)) #we don't want certain columns
## # A tibble: 58,788 × 15
## X title year length budget rating votes mpaa
## <int> <chr> <int> <int> <int> <dbl> <int> <chr>
## 1 1 $ 1971 121 NA 6.4 348
## 2 2 $1000 a Touchdown 1939 71 NA 6.0 20
## 3 3 $21 a Day Once a Month 1941 7 NA 8.2 5
## 4 4 $40,000 1996 70 NA 8.2 6
## 5 5 $50,000 Climax Show, The 1975 71 NA 3.4 17
## 6 6 $pent 2000 91 NA 4.3 45
## 7 7 $windle 2002 93 NA 5.3 200 R
## 8 8 '15' 2002 25 NA 6.7 24
## 9 9 '38 1987 97 NA 6.6 18
## 10 10 '49-'17 1917 61 NA 6.0 51
## # ... with 58,778 more rows, and 7 more variables: Action <int>,
## # Animation <int>, Comedy <int>, Drama <int>, Documentary <int>,
## # Romance <int>, Short <int>
select(movies_df, -c(budget, r1:r10, Animation, Documentary, Short, Romance)) #we don't want certain columns
## # A tibble: 58,788 × 10
## X title year length rating votes mpaa Action
## <int> <chr> <int> <int> <dbl> <int> <chr> <int>
## 1 1 $ 1971 121 6.4 348 0
## 2 2 $1000 a Touchdown 1939 71 6.0 20 0
## 3 3 $21 a Day Once a Month 1941 7 8.2 5 0
## 4 4 $40,000 1996 70 8.2 6 0
## 5 5 $50,000 Climax Show, The 1975 71 3.4 17 0
## 6 6 $pent 2000 91 4.3 45 0
## 7 7 $windle 2002 93 5.3 200 R 1
## 8 8 '15' 2002 25 6.7 24 0
## 9 9 '38 1987 97 6.6 18 0
## 10 10 '49-'17 1917 61 6.0 51 0
## # ... with 58,778 more rows, and 2 more variables: Comedy <int>,
## # Drama <int>
#more examples of Select
head(select(mtcars, wt, mpg) )
## wt mpg
## Mazda RX4 2.620 21.0
## Mazda RX4 Wag 2.875 21.0
## Datsun 710 2.320 22.8
## Hornet 4 Drive 3.215 21.4
## Hornet Sportabout 3.440 18.7
## Valiant 3.460 18.1
head(select(iris, Species, Sepal.Length) )
## Species Sepal.Length
## 1 setosa 5.1
## 2 setosa 4.9
## 3 setosa 4.7
## 4 setosa 4.6
## 5 setosa 5.0
## 6 setosa 5.4
head(select(iris, -Species) )
## Sepal.Length Sepal.Width Petal.Length Petal.Width id
## 1 5.1 3.5 1.4 0.2 1
## 2 4.9 3.0 1.4 0.2 2
## 3 4.7 3.2 1.3 0.2 3
## 4 4.6 3.1 1.5 0.2 4
## 5 5.0 3.6 1.4 0.2 5
## 6 5.4 3.9 1.7 0.4 6
head( select(iris, -c(Species, Petal.Length)) )
## Sepal.Length Sepal.Width Petal.Width id
## 1 5.1 3.5 0.2 1
## 2 4.9 3.0 0.2 2
## 3 4.7 3.2 0.2 3
## 4 4.6 3.1 0.2 4
## 5 5.0 3.6 0.2 5
## 6 5.4 3.9 0.4 6
#-----------------------------------------------
# MULTIPLE WAYS TO SELECT THE COLUMNS WE DESIRE
#-----------------------------------------------
# SELECT has many USEFUL FEATURES...
# Many Different way to select the columns we are interested in:
select(movies_df, title, year, budget, length) # Variables of interest
## # A tibble: 58,788 × 4
## title year budget length
## <chr> <int> <int> <int>
## 1 $ 1971 NA 121
## 2 $1000 a Touchdown 1939 NA 71
## 3 $21 a Day Once a Month 1941 NA 7
## 4 $40,000 1996 NA 70
## 5 $50,000 Climax Show, The 1975 NA 71
## 6 $pent 2000 NA 91
## 7 $windle 2002 NA 93
## 8 '15' 2002 NA 25
## 9 '38 1987 NA 97
## 10 '49-'17 1917 NA 61
## # ... with 58,778 more rows
# You can also select a range of columns from start:end
select(movies_df, title:votes) # All the columns from title to votes get selected
## # A tibble: 58,788 × 6
## title year length budget rating votes
## <chr> <int> <int> <int> <dbl> <int>
## 1 $ 1971 121 NA 6.4 348
## 2 $1000 a Touchdown 1939 71 NA 6.0 20
## 3 $21 a Day Once a Month 1941 7 NA 8.2 5
## 4 $40,000 1996 70 NA 8.2 6
## 5 $50,000 Climax Show, The 1975 71 NA 3.4 17
## 6 $pent 2000 91 NA 4.3 45
## 7 $windle 2002 93 NA 5.3 200
## 8 '15' 2002 25 NA 6.7 24
## 9 '38 1987 97 NA 6.6 18
## 10 '49-'17 1917 61 NA 6.0 51
## # ... with 58,778 more rows
# Now, let's say that we want only the Columns that contain movie ratings...
select(movies_df, contains("r")) # Any column that contains 'r' in its name
## # A tibble: 58,788 × 16
## year rating r1 r2 r3 r4 r5 r6 r7 r8 r9
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1971 6.4 4.5 4.5 4.5 4.5 14.5 24.5 24.5 14.5 4.5
## 2 1939 6.0 0.0 14.5 4.5 24.5 14.5 14.5 14.5 4.5 4.5
## 3 1941 8.2 0.0 0.0 0.0 0.0 0.0 24.5 0.0 44.5 24.5
## 4 1996 8.2 14.5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 34.5
## 5 1975 3.4 24.5 4.5 0.0 14.5 14.5 4.5 0.0 0.0 0.0
## 6 2000 4.3 4.5 4.5 4.5 14.5 14.5 14.5 4.5 4.5 14.5
## 7 2002 5.3 4.5 0.0 4.5 4.5 24.5 24.5 14.5 4.5 4.5
## 8 2002 6.7 4.5 4.5 4.5 4.5 4.5 14.5 14.5 14.5 4.5
## 9 1987 6.6 4.5 4.5 4.5 0.0 0.0 0.0 34.5 14.5 4.5
## 10 1917 6.0 4.5 0.0 4.5 4.5 4.5 44.5 14.5 4.5 4.5
## # ... with 58,778 more rows, and 5 more variables: r10 <dbl>, Drama <int>,
## # Documentary <int>, Romance <int>, Short <int>
select(movies_df, ends_with("t")) # All vars ending with "test".
## # A tibble: 58,788 × 2
## budget Short
## <int> <int>
## 1 NA 0
## 2 NA 0
## 3 NA 1
## 4 NA 0
## 5 NA 0
## 6 NA 0
## 7 NA 0
## 8 NA 1
## 9 NA 0
## 10 NA 0
## # ... with 58,778 more rows
select(movies_df, starts_with("r")) # Gets all vars staring with "R".
## # A tibble: 58,788 × 12
## rating r1 r2 r3 r4 r5 r6 r7 r8 r9 r10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 6.4 4.5 4.5 4.5 4.5 14.5 24.5 24.5 14.5 4.5 4.5
## 2 6.0 0.0 14.5 4.5 24.5 14.5 14.5 14.5 4.5 4.5 14.5
## 3 8.2 0.0 0.0 0.0 0.0 0.0 24.5 0.0 44.5 24.5 24.5
## 4 8.2 14.5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 34.5 45.5
## 5 3.4 24.5 4.5 0.0 14.5 14.5 4.5 0.0 0.0 0.0 24.5
## 6 4.3 4.5 4.5 4.5 14.5 14.5 14.5 4.5 4.5 14.5 14.5
## 7 5.3 4.5 0.0 4.5 4.5 24.5 24.5 14.5 4.5 4.5 14.5
## 8 6.7 4.5 4.5 4.5 4.5 4.5 14.5 14.5 14.5 4.5 14.5
## 9 6.6 4.5 4.5 4.5 0.0 0.0 0.0 34.5 14.5 4.5 24.5
## 10 6.0 4.5 0.0 4.5 4.5 4.5 44.5 14.5 4.5 4.5 4.5
## # ... with 58,778 more rows, and 1 more variables: Romance <int>
# The above is not quite what we want. We don't want the Romance column
select(movies_df, matches("r[0-9]")) # Columns that match a regex.
## # A tibble: 58,788 × 10
## r1 r2 r3 r4 r5 r6 r7 r8 r9 r10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 4.5 4.5 4.5 4.5 14.5 24.5 24.5 14.5 4.5 4.5
## 2 0.0 14.5 4.5 24.5 14.5 14.5 14.5 4.5 4.5 14.5
## 3 0.0 0.0 0.0 0.0 0.0 24.5 0.0 44.5 24.5 24.5
## 4 14.5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 34.5 45.5
## 5 24.5 4.5 0.0 14.5 14.5 4.5 0.0 0.0 0.0 24.5
## 6 4.5 4.5 4.5 14.5 14.5 14.5 4.5 4.5 14.5 14.5
## 7 4.5 0.0 4.5 4.5 24.5 24.5 14.5 4.5 4.5 14.5
## 8 4.5 4.5 4.5 4.5 4.5 14.5 14.5 14.5 4.5 14.5
## 9 4.5 4.5 4.5 0.0 0.0 0.0 34.5 14.5 4.5 24.5
## 10 4.5 0.0 4.5 4.5 4.5 44.5 14.5 4.5 4.5 4.5
## # ... with 58,778 more rows
select(movies_df, num_range("r", 1:10)) # q1 thru q4 regardless of location.
## # A tibble: 58,788 × 10
## r1 r2 r3 r4 r5 r6 r7 r8 r9 r10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 4.5 4.5 4.5 4.5 14.5 24.5 24.5 14.5 4.5 4.5
## 2 0.0 14.5 4.5 24.5 14.5 14.5 14.5 4.5 4.5 14.5
## 3 0.0 0.0 0.0 0.0 0.0 24.5 0.0 44.5 24.5 24.5
## 4 14.5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 34.5 45.5
## 5 24.5 4.5 0.0 14.5 14.5 4.5 0.0 0.0 0.0 24.5
## 6 4.5 4.5 4.5 14.5 14.5 14.5 4.5 4.5 14.5 14.5
## 7 4.5 0.0 4.5 4.5 24.5 24.5 14.5 4.5 4.5 14.5
## 8 4.5 4.5 4.5 4.5 4.5 14.5 14.5 14.5 4.5 14.5
## 9 4.5 4.5 4.5 0.0 0.0 0.0 34.5 14.5 4.5 24.5
## 10 4.5 0.0 4.5 4.5 4.5 44.5 14.5 4.5 4.5 4.5
## # ... with 58,778 more rows
# COMBINING FILTER WITH SELECT
# How to list movies that have a rating of GT 9.2 and GT 10000 votes?
movies %>%
filter(rating>=9.1 , votes > 1000) %>%
select(title, votes, rating)
## title votes rating
## 1 Godfather, The 122755 9.1
## 2 Shawshank Redemption, The 149494 9.1
ARRANGE - REORDER (SORT) ROWS
movies_df <- tbl_df(movies)
arrange(movies_df, rating) #but this is not what we want
## # A tibble: 58,788 × 25
## X title year length budget rating votes r1
## <int> <chr> <int> <int> <int> <dbl> <int> <dbl>
## 1 274 20/20 Vision 1999 20 NA 1 5 64.5
## 2 2139 American Flatulators 1995 3 NA 1 5 100.0
## 3 3851 Babo 73 1964 57 NA 1 7 84.5
## 4 4142 Bak syv hav 1991 87 NA 1 85 84.5
## 5 4231 Balls Bluff 1961 24 NA 1 11 84.5
## 6 4994 Bell, Bare and Beautiful 1963 64 NA 1 20 64.5
## 7 5407 Beyond the Wall of Sleep 2004 84 NA 1 16 45.5
## 8 5963 Black Love 1972 70 NA 1 8 74.5
## 9 6013 Black Sherlock Holmes, A 1918 12 NA 1 6 100.0
## 10 6096 Blair Fish Project, The 1999 10 NA 1 14 74.5
## # ... with 58,778 more rows, and 17 more variables: r2 <dbl>, r3 <dbl>,
## # r4 <dbl>, r5 <dbl>, r6 <dbl>, r7 <dbl>, r8 <dbl>, r9 <dbl>, r10 <dbl>,
## # mpaa <chr>, Action <int>, Animation <int>, Comedy <int>, Drama <int>,
## # Documentary <int>, Romance <int>, Short <int>
arrange(movies_df, desc(rating))
## # A tibble: 58,788 × 25
## X title year length budget
## <int> <chr> <int> <int> <int>
## 1 13908 Dimensia Minds Trilogy: The Hope Factor 2004 10 NA
## 2 18016 Fishing for Love 2001 7 NA
## 3 49846 Summer Sonata, A 2004 30 2000
## 4 5898 Black Canyon 2004 25 3000
## 5 7711 Buck Privates 1928 70 NA
## 6 13171 Defilada 1989 60 NA
## 7 13909 Dimensia Minds Trilogy: The Reds 2004 8 1200
## 8 15019 Drifting 2004 21 3000
## 9 15659 Ekdromi, I 1966 85 NA
## 10 19826 Genet parle d'Angela Davis 1970 10 NA
## # ... with 58,778 more rows, and 20 more variables: rating <dbl>,
## # votes <int>, r1 <dbl>, r2 <dbl>, r3 <dbl>, r4 <dbl>, r5 <dbl>,
## # r6 <dbl>, r7 <dbl>, r8 <dbl>, r9 <dbl>, r10 <dbl>, mpaa <chr>,
## # Action <int>, Animation <int>, Comedy <int>, Drama <int>,
## # Documentary <int>, Romance <int>, Short <int>
arrange(movies_df, desc(rating), desc(year))
## # A tibble: 58,788 × 25
## X title year length budget
## <int> <chr> <int> <int> <int>
## 1 13908 Dimensia Minds Trilogy: The Hope Factor 2004 10 NA
## 2 49846 Summer Sonata, A 2004 30 2000
## 3 18016 Fishing for Love 2001 7 NA
## 4 27510 Keeper of the Past 2005 18 30000
## 5 5898 Black Canyon 2004 25 3000
## 6 13909 Dimensia Minds Trilogy: The Reds 2004 8 1200
## 7 15019 Drifting 2004 21 3000
## 8 37399 Of Age 2004 5 1000
## 9 39498 Penis Envy 2004 8 NA
## 10 40458 Plight of Clownana, The 2004 15 6000
## # ... with 58,778 more rows, and 20 more variables: rating <dbl>,
## # votes <int>, r1 <dbl>, r2 <dbl>, r3 <dbl>, r4 <dbl>, r5 <dbl>,
## # r6 <dbl>, r7 <dbl>, r8 <dbl>, r9 <dbl>, r10 <dbl>, mpaa <chr>,
## # Action <int>, Animation <int>, Comedy <int>, Drama <int>,
## # Documentary <int>, Romance <int>, Short <int>
# Notice the difference, when the year and rating are switched...
arrange(movies_df, desc(year), desc(rating))
## # A tibble: 58,788 × 25
## X title year length budget rating votes r1
## <int> <chr> <int> <int> <int> <dbl> <int> <dbl>
## 1 27510 Keeper of the Past 2005 18 30000 9.9 7 0
## 2 20878 Goodnite Charlie 2005 119 100000 9.8 34 0
## 3 37176 Nun Fu 2005 5 5000 9.8 5 0
## 4 37262 Oath, The 2005 23 NA 9.8 5 0
## 5 56370 Weg ist das Spiel, Der 2005 3 NA 9.8 8 0
## 6 34472 Morphin(e) 2005 20 8000 9.7 7 0
## 7 6133 Blaze Orange 2005 16 0 9.6 7 0
## 8 17759 Filmic Achievement 2005 80 NA 9.6 10 0
## 9 20875 Goodnight Bill 2005 19 NA 9.6 7 0
## 10 57067 Wild Girls Gone 2005 93 NA 9.6 7 0
## # ... with 58,778 more rows, and 17 more variables: r2 <dbl>, r3 <dbl>,
## # r4 <dbl>, r5 <dbl>, r6 <dbl>, r7 <dbl>, r8 <dbl>, r9 <dbl>, r10 <dbl>,
## # mpaa <chr>, Action <int>, Animation <int>, Comedy <int>, Drama <int>,
## # Documentary <int>, Romance <int>, Short <int>
#-----------------------------------
#Another example using mtcars
names(mtcars)
## [1] "mpg" "cyl" "disp" "hp" "drat" "wt" "qsec" "vs" "am" "gear"
## [11] "carb"
head( arrange(mtcars, mpg) )
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 10.4 8 472 205 2.93 5.250 17.98 0 0 3 4
## 2 10.4 8 460 215 3.00 5.424 17.82 0 0 3 4
## 3 13.3 8 350 245 3.73 3.840 15.41 0 0 3 4
## 4 14.3 8 360 245 3.21 3.570 15.84 0 0 3 4
## 5 14.7 8 440 230 3.23 5.345 17.42 0 0 3 4
## 6 15.0 8 301 335 3.54 3.570 14.60 0 1 5 8
head(arrange(mtcars, desc(mpg)) )
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## 2 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## 3 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## 4 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## 5 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## 6 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
MUTATE
#Creates a new column
head(mutate(iris, aspect_ratio = Petal.Width/Petal.Length) )
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species id
## 1 5.1 3.5 1.4 0.2 setosa 1
## 2 4.9 3.0 1.4 0.2 setosa 2
## 3 4.7 3.2 1.3 0.2 setosa 3
## 4 4.6 3.1 1.5 0.2 setosa 4
## 5 5.0 3.6 1.4 0.2 setosa 5
## 6 5.4 3.9 1.7 0.4 setosa 6
## aspect_ratio
## 1 0.1428571
## 2 0.1428571
## 3 0.1538462
## 4 0.1333333
## 5 0.1428571
## 6 0.2352941
movies_with_budgets <- filter(movies_df, !is.na(budget))
mutate(movies_with_budgets, costPerMinute = budget/length) %>%
select(title, costPerMinute)
## # A tibble: 5,215 × 2
## title costPerMinute
## <chr> <dbl>
## 1 'G' Men 5294.1176
## 2 'Manos' the Hands of Fate 256.7568
## 3 'Til There Was You 203539.8230
## 4 .com for Murder 52083.3333
## 5 10 Things I Hate About You 164948.4536
## 6 100 Mile Rule 11224.4898
## 7 100 Proof 1489.3617
## 8 101 1709.4017
## 9 101-vy kilometer 1941.7476
## 10 102 Dalmatians 850000.0000
## # ... with 5,205 more rows
GROUP_BY AND SUMMARISE
# This seems how roup_by works...
by_rating <- group_by(movies_df, rating)
head(movies_df)
## # A tibble: 6 × 25
## X title year length budget rating votes r1
## <int> <chr> <int> <int> <int> <dbl> <int> <dbl>
## 1 1 $ 1971 121 NA 6.4 348 4.5
## 2 2 $1000 a Touchdown 1939 71 NA 6.0 20 0.0
## 3 3 $21 a Day Once a Month 1941 7 NA 8.2 5 0.0
## 4 4 $40,000 1996 70 NA 8.2 6 14.5
## 5 5 $50,000 Climax Show, The 1975 71 NA 3.4 17 24.5
## 6 6 $pent 2000 91 NA 4.3 45 4.5
## # ... with 17 more variables: r2 <dbl>, r3 <dbl>, r4 <dbl>, r5 <dbl>,
## # r6 <dbl>, r7 <dbl>, r8 <dbl>, r9 <dbl>, r10 <dbl>, mpaa <chr>,
## # Action <int>, Animation <int>, Comedy <int>, Drama <int>,
## # Documentary <int>, Romance <int>, Short <int>
by_rating
## Source: local data frame [58,788 x 25]
## Groups: rating [91]
##
## X title year length budget rating votes r1
## <int> <chr> <int> <int> <int> <dbl> <int> <dbl>
## 1 1 $ 1971 121 NA 6.4 348 4.5
## 2 2 $1000 a Touchdown 1939 71 NA 6.0 20 0.0
## 3 3 $21 a Day Once a Month 1941 7 NA 8.2 5 0.0
## 4 4 $40,000 1996 70 NA 8.2 6 14.5
## 5 5 $50,000 Climax Show, The 1975 71 NA 3.4 17 24.5
## 6 6 $pent 2000 91 NA 4.3 45 4.5
## 7 7 $windle 2002 93 NA 5.3 200 4.5
## 8 8 '15' 2002 25 NA 6.7 24 4.5
## 9 9 '38 1987 97 NA 6.6 18 4.5
## 10 10 '49-'17 1917 61 NA 6.0 51 4.5
## # ... with 58,778 more rows, and 17 more variables: r2 <dbl>, r3 <dbl>,
## # r4 <dbl>, r5 <dbl>, r6 <dbl>, r7 <dbl>, r8 <dbl>, r9 <dbl>, r10 <dbl>,
## # mpaa <chr>, Action <int>, Animation <int>, Comedy <int>, Drama <int>,
## # Documentary <int>, Romance <int>, Short <int>
by_year <- group_by(movies_df, year)
by_year
## Source: local data frame [58,788 x 25]
## Groups: year [113]
##
## X title year length budget rating votes r1
## <int> <chr> <int> <int> <int> <dbl> <int> <dbl>
## 1 1 $ 1971 121 NA 6.4 348 4.5
## 2 2 $1000 a Touchdown 1939 71 NA 6.0 20 0.0
## 3 3 $21 a Day Once a Month 1941 7 NA 8.2 5 0.0
## 4 4 $40,000 1996 70 NA 8.2 6 14.5
## 5 5 $50,000 Climax Show, The 1975 71 NA 3.4 17 24.5
## 6 6 $pent 2000 91 NA 4.3 45 4.5
## 7 7 $windle 2002 93 NA 5.3 200 4.5
## 8 8 '15' 2002 25 NA 6.7 24 4.5
## 9 9 '38 1987 97 NA 6.6 18 4.5
## 10 10 '49-'17 1917 61 NA 6.0 51 4.5
## # ... with 58,778 more rows, and 17 more variables: r2 <dbl>, r3 <dbl>,
## # r4 <dbl>, r5 <dbl>, r6 <dbl>, r7 <dbl>, r8 <dbl>, r9 <dbl>, r10 <dbl>,
## # mpaa <chr>, Action <int>, Animation <int>, Comedy <int>, Drama <int>,
## # Documentary <int>, Romance <int>, Short <int>
# Let's see the histogram of movies by rating, by year
head(by_rating %>% summarize(n()) )
## # A tibble: 6 × 2
## rating `n()`
## <dbl> <int>
## 1 1.0 106
## 2 1.1 44
## 3 1.2 36
## 4 1.3 37
## 5 1.4 49
## 6 1.5 59
by_year %>% summarize(avg_rating = mean(rating))
## # A tibble: 113 × 2
## year avg_rating
## <int> <dbl>
## 1 1893 7.000000
## 2 1894 4.888889
## 3 1895 5.500000
## 4 1896 5.269231
## 5 1897 4.677778
## 6 1898 5.040000
## 7 1899 4.277778
## 8 1900 4.731250
## 9 1901 4.682143
## 10 1902 4.900000
## # ... with 103 more rows
head( summarise(by_rating,
num_movies_by_rating = n(),
average_length = mean(length))
)
## # A tibble: 6 × 3
## rating num_movies_by_rating average_length
## <dbl> <int> <dbl>
## 1 1.0 106 64.91509
## 2 1.1 44 70.65909
## 3 1.2 36 66.47222
## 4 1.3 37 73.67568
## 5 1.4 49 65.87755
## 6 1.5 59 74.47458
movies %>%
group_by(year) %>%
summarize(MoviesPerYear=n())
## # A tibble: 113 × 2
## year MoviesPerYear
## <int> <int>
## 1 1893 1
## 2 1894 9
## 3 1895 3
## 4 1896 13
## 5 1897 9
## 6 1898 5
## 7 1899 9
## 8 1900 16
## 9 1901 28
## 10 1902 9
## # ... with 103 more rows
avg_rating_by_year <-
group_by(movies_df, year) %>%
summarize(avg_rating = mean(rating))
avg_rating_by_year
## # A tibble: 113 × 2
## year avg_rating
## <int> <dbl>
## 1 1893 7.000000
## 2 1894 4.888889
## 3 1895 5.500000
## 4 1896 5.269231
## 5 1897 4.677778
## 6 1898 5.040000
## 7 1899 4.277778
## 8 1900 4.731250
## 9 1901 4.682143
## 10 1902 4.900000
## # ... with 103 more rows
## GROUP_BY EXAMPLE #2
titanic <- as.data.frame(Titanic)
titanic #flattened table... with 32 rows and 5 columns
## Class Sex Age Survived Freq
## 1 1st Male Child No 0
## 2 2nd Male Child No 0
## 3 3rd Male Child No 35
## 4 Crew Male Child No 0
## 5 1st Female Child No 0
## 6 2nd Female Child No 0
## 7 3rd Female Child No 17
## 8 Crew Female Child No 0
## 9 1st Male Adult No 118
## 10 2nd Male Adult No 154
## 11 3rd Male Adult No 387
## 12 Crew Male Adult No 670
## 13 1st Female Adult No 4
## 14 2nd Female Adult No 13
## 15 3rd Female Adult No 89
## 16 Crew Female Adult No 3
## 17 1st Male Child Yes 5
## 18 2nd Male Child Yes 11
## 19 3rd Male Child Yes 13
## 20 Crew Male Child Yes 0
## 21 1st Female Child Yes 1
## 22 2nd Female Child Yes 13
## 23 3rd Female Child Yes 14
## 24 Crew Female Child Yes 0
## 25 1st Male Adult Yes 57
## 26 2nd Male Adult Yes 14
## 27 3rd Male Adult Yes 75
## 28 Crew Male Adult Yes 192
## 29 1st Female Adult Yes 140
## 30 2nd Female Adult Yes 80
## 31 3rd Female Adult Yes 76
## 32 Crew Female Adult Yes 20
names(titanic) #Freq is the column that shows number of people
## [1] "Class" "Sex" "Age" "Survived" "Freq"
# How many people in each class in Titanic?
titanic %>%
group_by(Class) %>%
summarize(TotalByClass = sum(Freq))
## # A tibble: 4 × 2
## Class TotalByClass
## <fctr> <dbl>
## 1 1st 325
## 2 2nd 285
## 3 3rd 706
## 4 Crew 885
titanic %>% group_by(Class, Survived) %>% summarize(TotalByClassBySurvived = sum(Freq))
## Source: local data frame [8 x 3]
## Groups: Class [?]
##
## Class Survived TotalByClassBySurvived
## <fctr> <fctr> <dbl>
## 1 1st No 122
## 2 1st Yes 203
## 3 2nd No 167
## 4 2nd Yes 118
## 5 3rd No 528
## 6 3rd Yes 178
## 7 Crew No 673
## 8 Crew Yes 212
titanic %>% group_by(Sex, Survived) %>% summarize(TotalByGenderBySurvived = sum(Freq))
## Source: local data frame [4 x 3]
## Groups: Sex [?]
##
## Sex Survived TotalByGenderBySurvived
## <fctr> <fctr> <dbl>
## 1 Male No 1364
## 2 Male Yes 367
## 3 Female No 126
## 4 Female Yes 344
titanic %>% group_by(Class, Sex, Survived) %>% summarize(GroupTotal = sum(Freq))
## Source: local data frame [16 x 4]
## Groups: Class, Sex [?]
##
## Class Sex Survived GroupTotal
## <fctr> <fctr> <fctr> <dbl>
## 1 1st Male No 118
## 2 1st Male Yes 62
## 3 1st Female No 4
## 4 1st Female Yes 141
## 5 2nd Male No 154
## 6 2nd Male Yes 25
## 7 2nd Female No 13
## 8 2nd Female Yes 93
## 9 3rd Male No 422
## 10 3rd Male Yes 88
## 11 3rd Female No 106
## 12 3rd Female Yes 90
## 13 Crew Male No 670
## 14 Crew Male Yes 192
## 15 Crew Female No 3
## 16 Crew Female Yes 20
#----------------------------------------------------------
#Example of grouping by TWO variables...
mtcars %>%
group_by(cyl, gear) %>%
summarize(datapoints = n())
## Source: local data frame [8 x 3]
## Groups: cyl [?]
##
## cyl gear datapoints
## <dbl> <fctr> <int>
## 1 4 3 1
## 2 4 4 8
## 3 4 5 2
## 4 6 3 2
## 5 6 4 4
## 6 6 5 1
## 7 8 3 12
## 8 8 5 2
head(
mtcars %>%
group_by(cyl, gear) %>%
mutate(cyl_gear_count = n()) #Mutate adds a new variable. n() is the count for that group.
)
## Source: local data frame [6 x 12]
## Groups: cyl, gear [4]
##
## mpg cyl disp hp drat wt qsec vs am gear carb
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fctr> <dbl>
## 1 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## 2 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## 3 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## 4 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## 5 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## 6 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
## # ... with 1 more variables: cyl_gear_count <int>
# Advanced - multiple summarize
mtcars %>%
group_by(cyl, gear) %>%
summarize(datapoints = n()) %>%
summarize(datapoints = n())
## # A tibble: 3 × 2
## cyl datapoints
## <dbl> <int>
## 1 4 3
## 2 6 3
## 3 8 2
# Notice that there are two summrizes here.
# "When you group by multiple variables,
# each summary peels off one level of the grouping."
# Therfore, grouping by gear is gone after the second summarize.
## EXTRAS
# Chaining...
producers_nightmare <- filter(movies_df, !is.na(budget), year>1990) %>%
mutate(costPerMinute = budget/length) %>%
arrange(desc(costPerMinute)) %>%
select(title, year, costPerMinute)
producers_nightmare
## # A tibble: 3,120 × 3
## title year costPerMinute
## <chr> <int> <dbl>
## 1 Star Trek the Experience: The Klingon Encounter 1998 17500000
## 2 Back to the Future... The Ride 1991 10000000
## 3 T2 3-D: Battle Across Time 1996 5000000
## 4 Tarzan 1999 1704545
## 5 Terminator 3: Rise of the Machines 2003 1605505
## 6 Men in Black II 2002 1590909
## 7 Wild Wild West 1999 1588785
## 8 Spider-Man 2 2004 1574803
## 9 Dinosaur 2000 1554878
## 10 Polar Express, The 2004 1515152
## # ... with 3,110 more rows
## PLOTTING
head(
movies %>%
group_by(rating) %>%
summarize(n())
)
## # A tibble: 6 × 2
## rating `n()`
## <dbl> <int>
## 1 1.0 106
## 2 1.1 44
## 3 1.2 36
## 4 1.3 37
## 5 1.4 49
## 6 1.5 59
# you can even pipe it to Plot...
# movies %>%
# group_by(rating) %>%
# summarize(n()) %>%
# plot() # plots the histogram of movies by Each value of rating
# movies %>%
# group_by(year) %>%
# summarise(y=mean(rating)) %>%
# with(barplot(y, names.arg=year, main="AVG IMDB Rating by Year"))
#----------------------------------------------------------
# Extras
# EXAMPLE of GROUP_BY (tapply in base R does the same thing.)
# What if we wanted to calculate the average rating for each mpaa rating?
# Are PG movies rated higher than R rated movies?
movies %>%
filter(mpaa != "", length>80) %>%
group_by(mpaa) %>%
summarize(mean(rating))
## # A tibble: 4 × 2
## mpaa `mean(rating)`
## <chr> <dbl>
## 1 NC-17 5.362500
## 2 PG 5.636456
## 3 PG-13 5.796939
## 4 R 5.445374
#compare against
tapply(movies$rating, movies$mpaa, mean)
## NC-17 PG PG-13 R
## 5.970537 5.362500 5.605492 5.800698 5.424874
#Try this yourself first:
#Can you print the top 2 movies (by rating) for each year?
#SPOLIER ALERT:
#SHOW THE TOP 2 MOVIES for each year
movies %>%
group_by(year) %>%
filter(year>2000) %>%
arrange(year, desc(rating)) %>%
filter(row_number(year) <=2) %>%
select(year, rating, title)
## Source: local data frame [10 x 3]
## Groups: year [5]
##
## year rating title
## <int> <dbl> <chr>
## 1 2001 10.0 Fishing for Love
## 2 2001 9.8 Martin Four
## 3 2002 9.8 Copi, je t'aime
## 4 2002 9.7 Thanks for Nothing
## 5 2003 9.8 I Am My Resume
## 6 2003 9.8 TankUp.US
## 7 2004 10.0 Dimensia Minds Trilogy: The Hope Factor
## 8 2004 10.0 Summer Sonata, A
## 9 2005 9.9 Keeper of the Past
## 10 2005 9.8 Goodnite Charlie
Pipe into Plot
names(movies)
## [1] "X" "title" "year" "length" "budget"
## [6] "rating" "votes" "r1" "r2" "r3"
## [11] "r4" "r5" "r6" "r7" "r8"
## [16] "r9" "r10" "mpaa" "Action" "Animation"
## [21] "Comedy" "Drama" "Documentary" "Romance" "Short"
head(movies)
## X title year length budget rating votes r1 r2 r3
## 1 1 $ 1971 121 NA 6.4 348 4.5 4.5 4.5
## 2 2 $1000 a Touchdown 1939 71 NA 6.0 20 0.0 14.5 4.5
## 3 3 $21 a Day Once a Month 1941 7 NA 8.2 5 0.0 0.0 0.0
## 4 4 $40,000 1996 70 NA 8.2 6 14.5 0.0 0.0
## 5 5 $50,000 Climax Show, The 1975 71 NA 3.4 17 24.5 4.5 0.0
## 6 6 $pent 2000 91 NA 4.3 45 4.5 4.5 4.5
## r4 r5 r6 r7 r8 r9 r10 mpaa Action Animation Comedy Drama
## 1 4.5 14.5 24.5 24.5 14.5 4.5 4.5 0 0 1 1
## 2 24.5 14.5 14.5 14.5 4.5 4.5 14.5 0 0 1 0
## 3 0.0 0.0 24.5 0.0 44.5 24.5 24.5 0 1 0 0
## 4 0.0 0.0 0.0 0.0 0.0 34.5 45.5 0 0 1 0
## 5 14.5 14.5 4.5 0.0 0.0 0.0 24.5 0 0 0 0
## 6 14.5 14.5 14.5 4.5 4.5 14.5 14.5 0 0 0 1
## Documentary Romance Short
## 1 0 0 0
## 2 0 0 0
## 3 0 0 1
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
df1 <- movies %>% group_by(rating) %>% summarize(n())
head(df1)
## # A tibble: 6 × 2
## rating `n()`
## <dbl> <int>
## 1 1.0 106
## 2 1.1 44
## 3 1.2 36
## 4 1.3 37
## 5 1.4 49
## 6 1.5 59
df1 %>% plot()

# plots the histogram of movies by Each value of rating!
movies %>%
group_by(year) %>%
summarise(y=mean(rating)) %>%
with(barplot(y, names.arg=year, main="AVG IMDB Rating by Year"))
