Numbers

Not a Number

library(ggplot2)

# Four Reserved Words in R (NA, NaN, Inf & NULL)
# â¢ NA      # missing!
# â¢ NULL    # undefined!
# â¢ Inf     # infinite 3/0!
# â¢ NaN     # Not a number Inf/Inf!

# 1. Not a Number

# Inf/Inf #one way to create NaN
x <- 4/0
x/x

## [1] NaN

class(x)

## [1] "numeric"

class(x/x) #Note the type. Not a number is also of type numeric

## [1] "numeric"

x <- 4/0
y<- x/x
is.nan(y) #This is how you detect a NaN

## [1] TRUE

# 2. Understanding NA's
x <- c(3,4, NA, 6)
x

## [1]  3  4 NA  6

x[3]

## [1] NA

class(x[3]) #numeric

## [1] "numeric"

is.na(x)

## [1] FALSE FALSE  TRUE FALSE

na.omit(x)

## [1] 3 4 6
## attr(,"na.action")
## [1] 3
## attr(,"class")
## [1] "omit"

y <- c(TRUE, NA, FALSE, TRUE)
class(y[2])

## [1] "logical"

## DEALING WITH NA's
airquality$Ozone # make sure that we do see some NA's

##   [1]  41  36  12  18  NA  28  23  19   8  NA   7  16  11  14  18  14  34
##  [18]   6  30  11   1  11   4  32  NA  NA  NA  23  45 115  37  NA  NA  NA
##  [35]  NA  NA  NA  29  NA  71  39  NA  NA  23  NA  NA  21  37  20  12  13
##  [52]  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA 135  49  32  NA  64  40  77
##  [69]  97  97  85  NA  10  27  NA   7  48  35  61  79  63  16  NA  NA  80
##  [86] 108  20  52  82  50  64  59  39   9  16  78  35  66 122  89 110  NA
## [103]  NA  44  28  65  NA  22  59  23  31  44  21   9  NA  45 168  73  NA
## [120]  76 118  84  85  96  78  73  91  47  32  20  23  21  24  44  21  28
## [137]   9  13  46  18  13  24  16  13  23  36   7  14  30  NA  14  18  20

# First, run is.na() on the column.
is.na(airquality$Ozone)

##   [1] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE FALSE
##  [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [23] FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE  TRUE  TRUE
##  [34]  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE FALSE  TRUE  TRUE FALSE
##  [45]  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE
##  [56]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE  TRUE FALSE
##  [67] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE
##  [78] FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE
##  [89] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [100] FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
## [111] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE
## [122] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [144] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE

# How many Ozone values are NA?
sum(is.na(airquality$Ozone))

## [1] 37

# Shortcut. Simply use summary()
summary(airquality) #Notice that Summary tells us how many NA's there are

##      Ozone           Solar.R           Wind             Temp      
##  Min.   :  1.00   Min.   :  7.0   Min.   : 1.700   Min.   :56.00  
##  1st Qu.: 18.00   1st Qu.:115.8   1st Qu.: 7.400   1st Qu.:72.00  
##  Median : 31.50   Median :205.0   Median : 9.700   Median :79.00  
##  Mean   : 42.13   Mean   :185.9   Mean   : 9.958   Mean   :77.88  
##  3rd Qu.: 63.25   3rd Qu.:258.8   3rd Qu.:11.500   3rd Qu.:85.00  
##  Max.   :168.00   Max.   :334.0   Max.   :20.700   Max.   :97.00  
##  NA's   :37       NA's   :7                                       
##      Month            Day      
##  Min.   :5.000   Min.   : 1.0  
##  1st Qu.:6.000   1st Qu.: 8.0  
##  Median :7.000   Median :16.0  
##  Mean   :6.993   Mean   :15.8  
##  3rd Qu.:8.000   3rd Qu.:23.0  
##  Max.   :9.000   Max.   :31.0  
##

# Print ROWS that DON'T have an NA for Ozone
head(airquality[!is.na(airquality$Ozone) , ] )

##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 6    28      NA 14.9   66     5   6
## 7    23     299  8.6   65     5   7

# But wait, airquality$Solar.R has NA's too.
head(na.omit(airquality) )

##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 7    23     299  8.6   65     5   7
## 8    19      99 13.8   59     5   8

# USING COMPLETE CASES
head( airquality[complete.cases(airquality), ] )

##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 7    23     299  8.6   65     5   7
## 8    19      99 13.8   59     5   8

#Finally, let's convert these NA's to 0s
#Note this is not always a good idea. 0s are very different from NA's
tf <- is.na(airquality$Solar.R) # TRUE FALSE conditional vector
head(tf)

## [1] FALSE FALSE FALSE FALSE  TRUE  TRUE

airquality$Solar.R[tf] <- 0
summary(airquality)

##      Ozone           Solar.R           Wind             Temp      
##  Min.   :  1.00   Min.   :  0.0   Min.   : 1.700   Min.   :56.00  
##  1st Qu.: 18.00   1st Qu.: 95.0   1st Qu.: 7.400   1st Qu.:72.00  
##  Median : 31.50   Median :194.0   Median : 9.700   Median :79.00  
##  Mean   : 42.13   Mean   :177.4   Mean   : 9.958   Mean   :77.88  
##  3rd Qu.: 63.25   3rd Qu.:256.0   3rd Qu.:11.500   3rd Qu.:85.00  
##  Max.   :168.00   Max.   :334.0   Max.   :20.700   Max.   :97.00  
##  NA's   :37                                                       
##      Month            Day      
##  Min.   :5.000   Min.   : 1.0  
##  1st Qu.:6.000   1st Qu.: 8.0  
##  Median :7.000   Median :16.0  
##  Mean   :6.993   Mean   :15.8  
##  3rd Qu.:8.000   3rd Qu.:23.0  
##  Max.   :9.000   Max.   :31.0  
##

DEDUPING - Removing DUPLICATED VALUES

head(iris)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

head(duplicated(iris) )

## [1] FALSE FALSE FALSE FALSE FALSE FALSE

iris[143,]

##     Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
## 143          5.8         2.7          5.1         1.9 virginica

iris[with(iris, Sepal.Length==5.8),]

##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 15           5.8         4.0          1.2         0.2     setosa
## 68           5.8         2.7          4.1         1.0 versicolor
## 83           5.8         2.7          3.9         1.2 versicolor
## 93           5.8         2.6          4.0         1.2 versicolor
## 102          5.8         2.7          5.1         1.9  virginica
## 115          5.8         2.8          5.1         2.4  virginica
## 143          5.8         2.7          5.1         1.9  virginica

with(iris, Sepal.Length==5.8)

##   [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [12] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [23] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [34] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [45] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [56] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [67] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [78] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
##  [89] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [100] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [111] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [122] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
## [144] FALSE FALSE FALSE FALSE FALSE FALSE FALSE

Convert strings to numbers

as.numeric("18.3")

## [1] 18.3

as.integer("18")

## [1] 18

as.integer("18.9") #rounds it down to an integer

## [1] 18

as.numeric("18a")

## Warning: NAs introduced by coercion

## [1] NA

#Extract Numbers from Strings
x <-  '8 23 37 38 20 40 39 41 21 33'
class(x)

## [1] "character"

tokens <- strsplit(x, " ")
str(tokens)

## List of 1
##  $ : chr [1:10] "8" "23" "37" "38" ...

vec <- unlist(tokens)
str(vec)

##  chr [1:10] "8" "23" "37" "38" "20" "40" "39" "41" ...

nums <- as.numeric(vec)
str(nums)

##  num [1:10] 8 23 37 38 20 40 39 41 21 33

# or in one line - efficient perhaps, but less readable
nums <- as.numeric(unlist(strsplit(x," ")))
str(nums)

##  num [1:10] 8 23 37 38 20 40 39 41 21 33

factors in R

class(iris$Species)

## [1] "factor"

iris$Species[1:5] #notice that all Levels are listed

## [1] setosa setosa setosa setosa setosa
## Levels: setosa versicolor virginica

str(mtcars)

## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...

#Let's make the "gear" column into a factor
mtcars$gear

##  [1] 4 4 4 3 3 3 3 4 4 4 4 3 3 3 3 3 3 4 4 4 3 3 3 3 3 4 5 5 5 5 5 4

mtcars$gear <- as.factor(mtcars$gear)
str(mtcars$gear)

##  Factor w/ 3 levels "3","4","5": 2 2 2 1 1 1 1 2 2 2 ...

#Formula Syntax in R
m <- mtcars$mpg ~ mtcars$cyl + mtcars$gear
str(m)

## Class 'formula'  language mtcars$mpg ~ mtcars$cyl + mtcars$gear
##   ..- attr(*, ".Environment")=<environment: R_GlobalEnv>

Data Manipulation

USE of SUBSET() COMMAND - seesm like “SELECT” of SQL

# Format: subset(dataframe, condition)
# Note: movies is present in ggplot2
# To keep only the R-rated movies
movies <- read.csv("c:/AGZ1/GD_AGZ1117/AGZ_Home/workspace_R/data/movies.csv",  stringsAsFactors = FALSE)
head(subset(movies, mpaa=="R") )

##       X         title year length  budget rating votes   r1   r2  r3   r4
## 7     7       $windle 2002     93      NA    5.3   200  4.5  0.0 4.5  4.5
## 43   43       'R Xmas 2001     83      NA    4.9   288 14.5  4.5 4.5  4.5
## 123 123     100 Girls 2000     90      NA    5.8  3349  4.5  4.5 4.5  4.5
## 124 124 100 Mile Rule 2002     98 1100000    5.6   181  4.5  4.5 4.5  4.5
## 153 153         11:11 2004     95      NA    4.3   222 14.5 14.5 4.5 14.5
## 169 169   13 Dead Men 2003     85      NA    2.2    69 44.5  4.5 4.5  4.5
##       r5   r6   r7   r8  r9  r10 mpaa Action Animation Comedy Drama
## 7   24.5 24.5 14.5  4.5 4.5 14.5    R      1         0      0     1
## 43  14.5 24.5 14.5  4.5 4.5  4.5    R      0         0      0     1
## 123 14.5 14.5 24.5 14.5 4.5  4.5    R      0         0      1     0
## 124 14.5 24.5 14.5 14.5 4.5 14.5    R      0         0      1     0
## 153 14.5 14.5  4.5  4.5 4.5 14.5    R      0         0      0     0
## 169  4.5  4.5  4.5  4.5 0.0 14.5    R      1         0      0     0
##     Documentary Romance Short
## 7             0       0     0
## 43            0       0     0
## 123           0       1     0
## 124           0       0     0
## 153           0       0     0
## 169           0       0     0

head(subset(iris, Species=="versicolor") )

##    Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 51          7.0         3.2          4.7         1.4 versicolor
## 52          6.4         3.2          4.5         1.5 versicolor
## 53          6.9         3.1          4.9         1.5 versicolor
## 54          5.5         2.3          4.0         1.3 versicolor
## 55          6.5         2.8          4.6         1.5 versicolor
## 56          5.7         2.8          4.5         1.3 versicolor

head(subset(iris, Sepal.Length > 7) )

##     Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
## 103          7.1         3.0          5.9         2.1 virginica
## 106          7.6         3.0          6.6         2.1 virginica
## 108          7.3         2.9          6.3         1.8 virginica
## 110          7.2         3.6          6.1         2.5 virginica
## 118          7.7         3.8          6.7         2.2 virginica
## 119          7.7         2.6          6.9         2.3 virginica

#Let's find the Murder Rate of the States in the highest Quartile
summary(USArrests)

##      Murder          Assault         UrbanPop          Rape      
##  Min.   : 0.800   Min.   : 45.0   Min.   :32.00   Min.   : 7.30  
##  1st Qu.: 4.075   1st Qu.:109.0   1st Qu.:54.50   1st Qu.:15.07  
##  Median : 7.250   Median :159.0   Median :66.00   Median :20.10  
##  Mean   : 7.788   Mean   :170.8   Mean   :65.54   Mean   :21.23  
##  3rd Qu.:11.250   3rd Qu.:249.0   3rd Qu.:77.75   3rd Qu.:26.18  
##  Max.   :17.400   Max.   :337.0   Max.   :91.00   Max.   :46.00

head(USArrests)

##            Murder Assault UrbanPop Rape
## Alabama      13.2     236       58 21.2
## Alaska       10.0     263       48 44.5
## Arizona       8.1     294       80 31.0
## Arkansas      8.8     190       50 19.5
## California    9.0     276       91 40.6
## Colorado      7.9     204       78 38.7

topQuartileMurderRate <- quantile(USArrests$Murder)[4]
#Now print the Arrests for these states with the highest Arrests for Murder
subset(USArrests, Murder > topQuartileMurderRate )

##                Murder Assault UrbanPop Rape
## Alabama          13.2     236       58 21.2
## Florida          15.4     335       80 31.9
## Georgia          17.4     211       60 25.8
## Louisiana        15.4     249       66 22.2
## Maryland         11.3     300       67 27.8
## Michigan         12.1     255       74 35.1
## Mississippi      16.1     259       44 17.1
## Nevada           12.2     252       81 46.0
## New Mexico       11.4     285       70 32.1
## North Carolina   13.0     337       45 16.1
## South Carolina   14.4     279       48 22.5
## Tennessee        13.2     188       59 26.9
## Texas            12.7     201       80 25.5

# USE OF Subset to get condition and only print out certain columns.
# We have to use the select argument to get the columns we want.
# If multiple columns are desired, use c(column1, column2, column3)
subset(USArrests, Murder > topQuartileMurderRate, select=UrbanPop )

##                UrbanPop
## Alabama              58
## Florida              80
## Georgia              60
## Louisiana            66
## Maryland             67
## Michigan             74
## Mississippi          44
## Nevada               81
## New Mexico           70
## North Carolina       45
## South Carolina       48
## Tennessee            59
## Texas                80

# TO DROP COLUMNS FROM DATA SETS USING just the column names subset()
smallUSArrests <- subset(USArrests, select=c(-UrbanPop, -Rape)) #Minus means we don't want those columns
head(smallUSArrests)

##            Murder Assault
## Alabama      13.2     236
## Alaska       10.0     263
## Arizona       8.1     294
## Arkansas      8.8     190
## California    9.0     276
## Colorado      7.9     204

Aggregating using TABLE

#Usage: table(vector)
table(iris$Species) #How many of each species are there?

## 
##     setosa versicolor  virginica 
##         50         50         50

head( subset(iris, Species == "setosa") ) # Show "setoda" only

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

head(mtcars)

##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

mtcars$gear

##  [1] 4 4 4 3 3 3 3 4 4 4 4 3 3 3 3 3 3 4 4 4 3 3 3 3 3 4 5 5 5 5 5 4
## Levels: 3 4 5

mtcars$cyl

##  [1] 6 6 4 6 8 6 8 4 4 6 6 8 8 8 8 8 8 4 4 4 4 8 8 8 8 4 4 4 8 6 8 4

table(mtcars$gear)

## 
##  3  4  5 
## 15 12  5

table(mtcars$cyl)

## 
##  4  6  8 
## 11  7 14

table(mtcars$gear, mtcars$cyl) # put it together to create a summary table

##    
##      4  6  8
##   3  1  2 12
##   4  8  4  0
##   5  2  1  2

names(movies)

##  [1] "X"           "title"       "year"        "length"      "budget"     
##  [6] "rating"      "votes"       "r1"          "r2"          "r3"         
## [11] "r4"          "r5"          "r6"          "r7"          "r8"         
## [16] "r9"          "r10"         "mpaa"        "Action"      "Animation"  
## [21] "Comedy"      "Drama"       "Documentary" "Romance"     "Short"

library(ggplot2)
# Table Using "with" -- notice that we don't need to use $
with(movies, table(year))

## year
## 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 
##    1    9    3   13    9    5    9   16   28    9   37   42   17   17   12 
## 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 
##   24   30   26   22   34   32   54   54   49   37   41   52   43   53   52 
## 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 
##   48   50   79   94   83  109  184  288  346  412  421  482  464  484  484 
## 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 
##  463  484  503  521  507  446  426  375  432  439  478  486  491  518  513 
## 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 
##  539  501  522  497  556  528  495  478  466  513  503  517  530  579  594 
## 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 
##  651  625  586  646  637  634  625  619  665  617  609  632  681  661  689 
## 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 
##  698  749  792  792  957  944  944  899  888  948 1016 1199 1248 1390 1568 
## 1998 1999 2000 2001 2002 2003 2004 2005 
## 1705 1927 2048 2121 2168 2158 1945  349

with(movies, table(length))

## length
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
##  169  116  243  185  279  726 1379  700  432  576  347  311  224  229  349 
##   16   17   18   19   20   21   22   23   24   25   26   27   28   29   30 
##  240  258  252  176  376  141  147  126   97  161  111  100  123  107  225 
##   31   32   33   34   35   36   37   38   39   40   41   42   43   44   45 
##   32   45   36   40   68   21   29   38   22  117   30   29   22   22   74 
##   46   47   48   49   50   51   52   53   54   55   56   57   58   59   60 
##   21   43   50   12  129   27   88   73  102  148  147  136  187  123  409 
##   61   62   63   64   65   66   67   68   69   70   71   72   73   74   75 
##  188  193  181  172  279  182  240  246  232  523  288  398  331  295  653 
##   76   77   78   79   80   81   82   83   84   85   86   87   88   89   90 
##  364  397  514  421 1149  567  803  768  826 1497  941 1152 1321 1134 3506 
##   91   92   93   94   95   96   97   98   99  100  101  102  103  104  105 
## 1304 1502 1395 1271 1900 1212 1199 1122  869 1711  700  853  752  719 1193 
##  106  107  108  109  110  111  112  113  114  115  116  117  118  119  120 
##  553  561  558  431  910  348  422  326  277  457  240  258  300  211  496 
##  121  122  123  124  125  126  127  128  129  130  131  132  133  134  135 
##  168  169  164  174  190  125  113  121   85  151   68   89   68   79  122 
##  136  137  138  139  140  141  142  143  144  145  146  147  148  149  150 
##   56   53   67   50  109   49   47   51   35   59   40   31   25   24   65 
##  151  152  153  154  155  156  157  158  159  160  161  162  163  164  165 
##   20   31   36   22   34   21   26   30   21   43   17   24   19   16   38 
##  166  167  168  169  170  171  172  173  174  175  176  177  178  179  180 
##   21   40   23   14   42   16   19   15   11   26   14   15   13   12   34 
##  181  182  183  184  185  186  187  188  189  190  191  192  193  194  195 
##    7    7    7    6    9   11   10   11    7    8    3    9    6    5    8 
##  196  197  198  199  200  201  202  204  205  206  207  208  209  210  211 
##    6    4    2    2   14    4    4    4    2    5    2    5    2    7    3 
##  212  213  214  215  216  217  218  219  220  221  222  223  224  225  226 
##    7    3    2    5    5    2    4    5    7    1    1    3    3    6    3 
##  227  228  229  230  231  232  233  234  235  236  237  238  239  240  241 
##    3    2    3    1    5    2    2    1    3    2    2    1    2   12    1 
##  242  243  244  245  246  247  248  250  251  252  254  255  256  259  260 
##    4    2    3    6    2    2    2    2    3    3    2    1    1    2    3 
##  261  263  264  265  267  269  270  275  277  278  279  283  285  288  293 
##    1    4    1    4    1    2    2    2    1    3    1    1    5    2    1 
##  294  298  299  300  301  302  306  311  312  315  316  320  321  328  358 
##    1    1    1    4    1    1    1    1    1    1    2    2    1    1    3 
##  360  384  390  399  402  407  410  417  418  480  485  501  555  566  647 
##    1    1    2    1    1    1    1    1    1    2    1    1    1    1    1 
##  773  873 1100 2880 5220 
##    1    1    1    1    1

with(movies, table(length>200))

## 
## FALSE  TRUE 
## 58538   250

X T A B S # C ROSS TABULATION

xtabs Uses the ‘formula syntax’

# Using Built-in Dataset called Titanic
xtabs(Freq ~ Class, data=Titanic)

## Class
##  1st  2nd  3rd Crew 
##  325  285  706  885

xtabs(Freq ~ Sex, data=Titanic)

## Sex
##   Male Female 
##   1731    470

xtabs(Freq ~ Survived, data=Titanic)

## Survived
##   No  Yes 
## 1490  711

#-- Let's make it slightly more complex. Bring in TWO VARIABLES.

#Notice the difference between the following two commands
xtabs(Freq ~ Survived+Sex, data=Titanic)

##         Sex
## Survived Male Female
##      No  1364    126
##      Yes  367    344

xtabs(Freq ~ Sex+Survived, data=Titanic)

##         Survived
## Sex        No  Yes
##   Male   1364  367
##   Female  126  344

#In the first "Survived" forms the rows. 
#In the second, "Survived" becomes the columns

AGGREGATE

# Aggregate WITHOUT USING THE FORMULA SYNTAX
head(airquality)

##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA       0 14.3   56     5   5
## 6    28       0 14.9   66     5   6

aggregate(airquality, by=list(airquality$Month), FUN=mean)

##   Group.1 Ozone  Solar.R      Wind     Temp Month  Day
## 1       5    NA 157.9032 11.622581 65.54839     5 16.0
## 2       6    NA 190.1667 10.266667 79.10000     6 15.5
## 3       7    NA 216.4839  8.941935 83.90323     7 16.0
## 4       8    NA 155.2258  8.793548 83.96774     8 16.0
## 5       9    NA 167.4333 10.180000 76.90000     9 15.5

aggregate(airquality, by=list(airquality$Month), FUN=mean, na.rm=TRUE)

##   Group.1    Ozone  Solar.R      Wind     Temp Month  Day
## 1       5 23.61538 157.9032 11.622581 65.54839     5 16.0
## 2       6 29.44444 190.1667 10.266667 79.10000     6 15.5
## 3       7 59.11538 216.4839  8.941935 83.90323     7 16.0
## 4       8 59.96154 155.2258  8.793548 83.96774     8 16.0
## 5       9 31.44828 167.4333 10.180000 76.90000     9 15.5

head(aggregate(airquality, by=list(airquality$Month, airquality$Day), FUN=mean, na.rm=TRUE) )

##   Group.1 Group.2 Ozone Solar.R Wind Temp Month Day
## 1       5       1    41     190  7.4   67     5   1
## 2       6       1   NaN     286  8.6   78     6   1
## 3       7       1   135     269  4.1   84     7   1
## 4       8       1    39      83  6.9   81     8   1
## 5       9       1    96     167  6.9   91     9   1
## 6       5       2    36     118  8.0   72     5   2

#BONUS Example
aggregate(state.x77, list(Region = state.region), mean)

##          Region Population   Income Illiteracy Life Exp    Murder  HS Grad
## 1     Northeast   5495.111 4570.222   1.000000 71.26444  4.722222 53.96667
## 2         South   4208.125 4011.938   1.737500 69.70625 10.581250 44.34375
## 3 North Central   4803.000 4611.083   0.700000 71.76667  5.275000 54.51667
## 4          West   2915.308 4702.615   1.023077 71.23462  7.215385 62.00000
##      Frost      Area
## 1 132.7778  18141.00
## 2  64.6250  54605.12
## 3 138.8333  62652.00
## 4 102.1538 134463.00

#-----
# USING FORUMLA SYNTAX
#------
head(airquality)

##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA       0 14.3   56     5   5
## 6    28       0 14.9   66     5   6

aggregate(data=airquality, Ozone~Month, mean)

##   Month    Ozone
## 1     5 23.61538
## 2     6 29.44444
## 3     7 59.11538
## 4     8 59.96154
## 5     9 31.44828

head(aggregate(data=airquality, Ozone~Month+Day, mean) )

##   Month Day Ozone
## 1     5   1    41
## 2     7   1   135
## 3     8   1    39
## 4     9   1    96
## 5     5   2    36
## 6     7   2    49

#use cbind if you want to aggregate more than one variable
head(aggregate(data=airquality, cbind(Ozone,Wind)~Month+Day, mean) )

##   Month Day Ozone Wind
## 1     5   1    41  7.4
## 2     7   1   135  4.1
## 3     8   1    39  6.9
## 4     9   1    96  6.9
## 5     5   2    36  8.0
## 6     7   2    49  9.2

# Bonus Example: IRIS DATA SET
aggregate(. ~Species, data=iris, FUN='length')

##      Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1     setosa           50          50           50          50
## 2 versicolor           50          50           50          50
## 3  virginica           50          50           50          50

aggregate(Sepal.Length ~ Species, data=iris, FUN='length')

##      Species Sepal.Length
## 1     setosa           50
## 2 versicolor           50
## 3  virginica           50

head(aggregate(Species ~ Sepal.Length, data=iris, FUN='length') ) #this is NOT what we want

##   Sepal.Length Species
## 1          4.3       1
## 2          4.4       3
## 3          4.5       1
## 4          4.6       4
## 5          4.7       2
## 6          4.8       5

aggregate(Sepal.Length ~ Species, data=iris, FUN='mean')

##      Species Sepal.Length
## 1     setosa        5.006
## 2 versicolor        5.936
## 3  virginica        6.588

C U T

head(airquality)

##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA       0 14.3   56     5   5
## 6    28       0 14.9   66     5   6

range(airquality$Temp)

## [1] 56 97

# [1] 56 97

# First let's cut this vector into 5 groups:
head(cut(airquality$Temp, 5) )

## [1] (64.2,72.4] (64.2,72.4] (72.4,80.6] (56,64.2]   (56,64.2]   (64.2,72.4]
## Levels: (56,64.2] (64.2,72.4] (72.4,80.6] (80.6,88.8] (88.8,97]

#---- Notice that R cut the range into 5 exactly equal intervals.

#If we use the labels=FALSE argument, we get an easier to read output
cut(airquality$Temp, 5, labels=FALSE)

##   [1] 2 2 3 1 1 2 2 1 1 2 3 2 2 2 1 1 2 1 2 1 1 3 1 1 1 1 1 2 4 3 3 3 3 2 4
##  [36] 4 3 4 4 5 4 5 5 4 3 3 3 2 2 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 5 5
##  [71] 5 4 3 4 5 3 4 4 4 4 4 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 5 5 5 5 4 4 4
## [106] 3 3 3 3 3 3 3 3 2 3 3 4 4 4 5 5 5 5 5 5 5 5 4 4 3 3 3 3 4 3 3 2 2 3 2
## [141] 3 2 4 1 2 4 2 1 2 3 3 3 2

#How many data points fall in each of the 5 intervals?
table(cut(airquality$Temp, 5))

## 
##   (56,64.2] (64.2,72.4] (72.4,80.6] (80.6,88.8]   (88.8,97] 
##          16          23          46          49          19

#-----Creating "NICE" Groups (Friendlier for human reading)
TempBreaks=seq(50,100, by=10)

TempBuckets <- cut(airquality$Temp, breaks=TempBreaks)
summary(TempBuckets)

##  (50,60]  (60,70]  (70,80]  (80,90] (90,100] 
##        8       25       52       54       14

#Let's create a NEW COLUMN in the data frame to store the bucket number
airquality$TempBuckets <- cut(airquality$Temp, TempBreaks, labels=FALSE)

#Let's cut the airquality Temp into 3 groups: High Medium and Low
high_medium_low <- cut(airquality$Temp, breaks=c(0, 70, 80, 100), labels=c("Low","Medium","High"))
summary(high_medium_low)

##    Low Medium   High 
##     33     52     68

A P P L Y Family of Functions

v <- c(40,2,83,28,58)
f <- factor(c("A","C","C","B","C"))
v

## [1] 40  2 83 28 58

## [1] A C C B C
## Levels: A B C

data(diamonds)
groups <- split(x, f)

## Warning in split.default(x, f): data length is not a multiple of split
## variable

groups

## $A
## [1] "8 23 37 38 20 40 39 41 21 33"
## 
## $B
## character(0)
## 
## $C
## character(0)

groups <- unstack(data.frame(x,f))

library(MASS)
split(Cars93$MPG.city, Cars93$Origin)$USA

##  [1] 22 19 16 19 16 16 25 25 19 21 18 15 17 17 20 23 20 29 23 22 17 21 18
## [24] 29 20 31 23 22 22 24 15 21 18 17 18 23 19 24 23 18 19 23 31 23 19 19
## [47] 19 28

head(mtcars)

##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

apply(mtcars, 1, max) # take each row of mtcars, and find its max value.

##           Mazda RX4       Mazda RX4 Wag          Datsun 710 
##                 "6"                 "6"                 "4" 
##      Hornet 4 Drive   Hornet Sportabout             Valiant 
##                 "6"                 "8"                 "6" 
##          Duster 360           Merc 240D            Merc 230 
##                 "8"                 "4"                 "4" 
##            Merc 280           Merc 280C          Merc 450SE 
##                 "6"                 "6"                 "8" 
##          Merc 450SL         Merc 450SLC  Cadillac Fleetwood 
##                 "8"                 "8"                 "8" 
## Lincoln Continental   Chrysler Imperial            Fiat 128 
##                 "8"                 "8"              "4.08" 
##         Honda Civic      Toyota Corolla       Toyota Corona 
##              "4.93"              "4.22"                 "4" 
##    Dodge Challenger         AMC Javelin          Camaro Z28 
##                 "8"                 "8"                 "8" 
##    Pontiac Firebird           Fiat X1-9       Porsche 914-2 
##                 "8"              "4.08"                 "5" 
##        Lotus Europa      Ford Pantera L        Ferrari Dino 
##                 "5"                 "8"                 "6" 
##       Maserati Bora          Volvo 142E 
##                 "8"              "4.11"

apply(iris[,1:4], 1, mean) #caution: This is just for illustration.

##   [1] 2.550 2.375 2.350 2.350 2.550 2.850 2.425 2.525 2.225 2.400 2.700
##  [12] 2.500 2.325 2.125 2.800 3.000 2.750 2.575 2.875 2.675 2.675 2.675
##  [23] 2.350 2.650 2.575 2.450 2.600 2.600 2.550 2.425 2.425 2.675 2.725
##  [34] 2.825 2.425 2.400 2.625 2.500 2.225 2.550 2.525 2.100 2.275 2.675
##  [45] 2.800 2.375 2.675 2.350 2.675 2.475 4.075 3.900 4.100 3.275 3.850
##  [56] 3.575 3.975 2.900 3.850 3.300 2.875 3.650 3.300 3.775 3.350 3.900
##  [67] 3.650 3.400 3.600 3.275 3.925 3.550 3.800 3.700 3.725 3.850 3.950
##  [78] 4.100 3.725 3.200 3.200 3.150 3.400 3.850 3.600 3.875 4.000 3.575
##  [89] 3.500 3.325 3.425 3.775 3.400 2.900 3.450 3.525 3.525 3.675 2.925
## [100] 3.475 4.525 3.875 4.525 4.150 4.375 4.825 3.400 4.575 4.200 4.850
## [111] 4.200 4.075 4.350 3.800 4.025 4.300 4.200 5.100 4.875 3.675 4.525
## [122] 3.825 4.800 3.925 4.450 4.550 3.900 3.950 4.225 4.400 4.550 5.025
## [133] 4.250 3.925 3.925 4.775 4.425 4.200 3.900 4.375 4.450 4.350 3.875
## [144] 4.550 4.550 4.300 3.925 4.175 4.325 3.950

#taking the mean of a bunch of different columns usually doesn't make mathematical sense.

#applying functions to columns
apply(mtcars, 2, summary)

##        mpg         cyl         disp        hp          drat       
## Length "32"        "32"        "32"        "32"        "32"       
## Class  "character" "character" "character" "character" "character"
## Mode   "character" "character" "character" "character" "character"
##        wt          qsec        vs          am          gear       
## Length "32"        "32"        "32"        "32"        "32"       
## Class  "character" "character" "character" "character" "character"
## Mode   "character" "character" "character" "character" "character"
##        carb       
## Length "32"       
## Class  "character"
## Mode   "character"

# L A P P L Y

lst <- list(1,"abc", 1.3, TRUE)
listClasses <- lapply(lst, class)
listClasses

## [[1]]
## [1] "numeric"
## 
## [[2]]
## [1] "character"
## 
## [[3]]
## [1] "numeric"
## 
## [[4]]
## [1] "logical"

#--- SAPPLY might be even better 
#--- understand the difference between the two commands:
lClasses <- lapply(lst, class)
sClasses <- sapply(lst,class)
str(lClasses)

## List of 4
##  $ : chr "numeric"
##  $ : chr "character"
##  $ : chr "numeric"
##  $ : chr "logical"

str(sClasses)

##  chr [1:4] "numeric" "character" "numeric" "logical"

lClasses <- lapply(mtcars, mean)

## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA

sClasses <- sapply(mtcars,mean)

## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA

str(lClasses)

## List of 11
##  $ mpg : num 20.1
##  $ cyl : num 6.19
##  $ disp: num 231
##  $ hp  : num 147
##  $ drat: num 3.6
##  $ wt  : num 3.22
##  $ qsec: num 17.8
##  $ vs  : num 0.438
##  $ am  : num 0.406
##  $ gear: num NA
##  $ carb: num 2.81

str(sClasses)

##  Named num [1:11] 20.09 6.19 230.72 146.69 3.6 ...
##  - attr(*, "names")= chr [1:11] "mpg" "cyl" "disp" "hp" ...

head(lapply(airquality$Ozone, round) )

## [[1]]
## [1] 41
## 
## [[2]]
## [1] 36
## 
## [[3]]
## [1] 12
## 
## [[4]]
## [1] 18
## 
## [[5]]
## [1] NA
## 
## [[6]]
## [1] 28

sapply(airquality$Ozone, round)

##   [1]  41  36  12  18  NA  28  23  19   8  NA   7  16  11  14  18  14  34
##  [18]   6  30  11   1  11   4  32  NA  NA  NA  23  45 115  37  NA  NA  NA
##  [35]  NA  NA  NA  29  NA  71  39  NA  NA  23  NA  NA  21  37  20  12  13
##  [52]  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA 135  49  32  NA  64  40  77
##  [69]  97  97  85  NA  10  27  NA   7  48  35  61  79  63  16  NA  NA  80
##  [86] 108  20  52  82  50  64  59  39   9  16  78  35  66 122  89 110  NA
## [103]  NA  44  28  65  NA  22  59  23  31  44  21   9  NA  45 168  73  NA
## [120]  76 118  84  85  96  78  73  91  47  32  20  23  21  24  44  21  28
## [137]   9  13  46  18  13  24  16  13  23  36   7  14  30  NA  14  18  20

sapply(airquality, max)

##       Ozone     Solar.R        Wind        Temp       Month         Day 
##          NA       334.0        20.7        97.0         9.0        31.0 
## TempBuckets 
##         5.0

head(airquality)

##   Ozone Solar.R Wind Temp Month Day TempBuckets
## 1    41     190  7.4   67     5   1           2
## 2    36     118  8.0   72     5   2           3
## 3    12     149 12.6   74     5   3           3
## 4    18     313 11.5   62     5   4           2
## 5    NA       0 14.3   56     5   5           1
## 6    28       0 14.9   66     5   6           2

T A P P L Y

#hts of 10 people 
heights <- c(177, 153, 133, 121, 164, 161, 127, 122, 180, 161, 131, 128)
groupIndex <- c("Male", "Woman", "Child", "Child","Male", "Woman", "Child", "Child","Male", "Woman", "Child", "Child")
#IMPORTANT: heights and index should be of the same length. 
# Each element of the vector should have an group identified in the Index vector.
tapply(heights, groupIndex, mean)

##    Child     Male    Woman 
## 127.0000 173.6667 158.3333

# Can not find the data now:
# gdp <- read.csv("~/data/countries_wide.csv") #replace this with the right path for your file
# gdp <- read.csv("data/countryCodes.csv") #replace this with the right path for your file
# str(gdp)
# tapply(gdp$POP, gdp$country.isocode, mean)
# tapply(gdp$POP, list(gdp$country.isocode,gdp$year), mean)

############
# D D P L Y 
##########
library(plyr)
#Let's use the in-built Titanic dataset to understand ddply
#what does ddply do?
# DDPLY: For each subset of a data frame, apply function then combine results into a data frame

str(Titanic)

##  table [1:4, 1:2, 1:2, 1:2] 0 0 35 0 0 0 17 0 118 154 ...
##  - attr(*, "dimnames")=List of 4
##   ..$ Class   : chr [1:4] "1st" "2nd" "3rd" "Crew"
##   ..$ Sex     : chr [1:2] "Male" "Female"
##   ..$ Age     : chr [1:2] "Child" "Adult"
##   ..$ Survived: chr [1:2] "No" "Yes"

titanic <- as.data.frame(Titanic)
head(titanic)

##   Class    Sex   Age Survived Freq
## 1   1st   Male Child       No    0
## 2   2nd   Male Child       No    0
## 3   3rd   Male Child       No   35
## 4  Crew   Male Child       No    0
## 5   1st Female Child       No    0
## 6   2nd Female Child       No    0

ddply(titanic, .(Class), summarize, Total=sum(Freq))

##   Class Total
## 1   1st   325
## 2   2nd   285
## 3   3rd   706
## 4  Crew   885

ddply(titanic, .(Class, Survived), summarize, Total=sum(Freq))

##   Class Survived Total
## 1   1st       No   122
## 2   1st      Yes   203
## 3   2nd       No   167
## 4   2nd      Yes   118
## 5   3rd       No   528
## 6   3rd      Yes   178
## 7  Crew       No   673
## 8  Crew      Yes   212

ddply(titanic, .(Sex, Survived), summarize, Total=sum(Freq))

##      Sex Survived Total
## 1   Male       No  1364
## 2   Male      Yes   367
## 3 Female       No   126
## 4 Female      Yes   344

ddply(titanic, .(Class, Sex, Survived), summarize, Total=sum(Freq))

##    Class    Sex Survived Total
## 1    1st   Male       No   118
## 2    1st   Male      Yes    62
## 3    1st Female       No     4
## 4    1st Female      Yes   141
## 5    2nd   Male       No   154
## 6    2nd   Male      Yes    25
## 7    2nd Female       No    13
## 8    2nd Female      Yes    93
## 9    3rd   Male       No   422
## 10   3rd   Male      Yes    88
## 11   3rd Female       No   106
## 12   3rd Female      Yes    90
## 13  Crew   Male       No   670
## 14  Crew   Male      Yes   192
## 15  Crew Female       No     3
## 16  Crew Female      Yes    20

str(titanic)

## 'data.frame':    32 obs. of  5 variables:
##  $ Class   : Factor w/ 4 levels "1st","2nd","3rd",..: 1 2 3 4 1 2 3 4 1 2 ...
##  $ Sex     : Factor w/ 2 levels "Male","Female": 1 1 1 1 2 2 2 2 1 1 ...
##  $ Age     : Factor w/ 2 levels "Child","Adult": 1 1 1 1 1 1 1 1 2 2 ...
##  $ Survived: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Freq    : num  0 0 35 0 0 0 17 0 118 154 ...

M E R G E

We want to fill in all missing hrs from 1 to 10, and fill in the value 0 for those.

x <- data.frame(hrs=c(3,4,7), values=1:3)
x

##   hrs values
## 1   3      1
## 2   4      2
## 3   7      3

allhours <- data.frame(hours=1:10) 
allhours

##    hours
## 1      1
## 2      2
## 3      3
## 4      4
## 5      5
## 6      6
## 7      7
## 8      8
## 9      9
## 10    10

merge(x, allhours, all=T, by.y="hours", by.x="hrs")

##    hrs values
## 1    1     NA
## 2    2     NA
## 3    3      1
## 4    4      2
## 5    5     NA
## 6    6     NA
## 7    7      3
## 8    8     NA
## 9    9     NA
## 10  10     NA

# These have error, I can't make them work now:
# df <- merge(x, allhours, all.y=TRUE, by=c("hrs"))
# df # convert the NA to 0
# df$values[is.na(df$values)] <- 0
# df

TIDY DATA: RESHAPING (ref. p8 of ITDA_Lecture+4a)

Wide Data: good for human to read; Long data: good for R

str(iris)

## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

head(iris)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

dim(iris)

## [1] 150   5

names(iris)

## [1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width" 
## [5] "Species"

library(reshape2)

m.iris <- melt(iris)

## Using Species as id variables

names(m.iris)

## [1] "Species"  "variable" "value"

dim(m.iris)

## [1] 600   3

head(m.iris)

##   Species     variable value
## 1  setosa Sepal.Length   5.1
## 2  setosa Sepal.Length   4.9
## 3  setosa Sepal.Length   4.7
## 4  setosa Sepal.Length   4.6
## 5  setosa Sepal.Length   5.0
## 6  setosa Sepal.Length   5.4

dcast(m.iris, Species~variable)

## Aggregation function missing: defaulting to length

##      Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1     setosa           50          50           50          50
## 2 versicolor           50          50           50          50
## 3  virginica           50          50           50          50

m.iris <- melt(data=iris, id.var="Species")
names(m.iris)

## [1] "Species"  "variable" "value"

head(iris)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

#m.iris
head(m.iris)

##   Species     variable value
## 1  setosa Sepal.Length   5.1
## 2  setosa Sepal.Length   4.9
## 3  setosa Sepal.Length   4.7
## 4  setosa Sepal.Length   4.6
## 5  setosa Sepal.Length   5.0
## 6  setosa Sepal.Length   5.4

#Once you melt it this way, no way to "remember" which rows came from where.

#need a row ID
iris$id <- row.names(iris)
irislong <- melt(iris, c("Species", "id"))
head(dcast(irislong, Species+id~variable) )

##   Species id Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1  setosa  1          5.1         3.5          1.4         0.2
## 2  setosa 10          4.9         3.1          1.5         0.1
## 3  setosa 11          5.4         3.7          1.5         0.2
## 4  setosa 12          4.8         3.4          1.6         0.2
## 5  setosa 13          4.8         3.0          1.4         0.1
## 6  setosa 14          4.3         3.0          1.1         0.1

# gdp
# melt(gdp)
# m.gdp <- melt(gdp, id.vars=c("country","country.isocode", "year"))   # Has error now
# names(m.gdp)
# dcast(m.gdp, country.isocode+year ~ variable)

## DATE AND TIME MANIPULATION
as.Date('2013-10-30')

## [1] "2013-10-30"

dte <- c("02/27/13")
as.Date(dte, "%m/%d/%y")

## [1] "2013-02-27"

# You can use ISOdate to combine elements and then convert to a date type
# ISOdate(year, month, day) and then as.Date()
as.Date(ISOdate(2013,1,1))

## [1] "2013-01-01"

#The format() argument inside date
#format(Sys.Date(), format=Ã¢ÂÂ%m/%d/%YÃ¢ÂÂ
#       Extracting Parts of a Date
d <- as.Date("2013-10-15")
p <- as.POSIXlt(d)
# Get parts of p
p$mon; p$year+1900

## [1] 9

## [1] 2013

s <- as.Date("2013-10-01")
e <- as.Date("2013-11-01")
#Creating a Sequence of Dates
seq(from=s, to=e, by=1)

##  [1] "2013-10-01" "2013-10-02" "2013-10-03" "2013-10-04" "2013-10-05"
##  [6] "2013-10-06" "2013-10-07" "2013-10-08" "2013-10-09" "2013-10-10"
## [11] "2013-10-11" "2013-10-12" "2013-10-13" "2013-10-14" "2013-10-15"
## [16] "2013-10-16" "2013-10-17" "2013-10-18" "2013-10-19" "2013-10-20"
## [21] "2013-10-21" "2013-10-22" "2013-10-23" "2013-10-24" "2013-10-25"
## [26] "2013-10-26" "2013-10-27" "2013-10-28" "2013-10-29" "2013-10-30"
## [31] "2013-10-31" "2013-11-01"

seq(from=s, by='year', length.out=3)

## [1] "2013-10-01" "2014-10-01" "2015-10-01"

DATA MANIPULATION WITH DPLYR

# SIMPLE INTRODUCTION TO PIPES
round(sqrt(1000), 3) #usual R way of doing things

## [1] 31.623

library(magrittr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## The following object is masked from 'package:MASS':
## 
##     select

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

1000 %>% sqrt %>% round()

## [1] 32

1000 %>% sqrt %>% round(.,3)

## [1] 31.623

####
# DPLYR BASICS
#####

#TBL_DF & GLIMPSE()
#pretty print a data frame
library(ggplot2)

pretty_movies <- tbl_df(movies)
head(movies)

##   X                    title year length budget rating votes   r1   r2  r3
## 1 1                        $ 1971    121     NA    6.4   348  4.5  4.5 4.5
## 2 2        $1000 a Touchdown 1939     71     NA    6.0    20  0.0 14.5 4.5
## 3 3   $21 a Day Once a Month 1941      7     NA    8.2     5  0.0  0.0 0.0
## 4 4                  $40,000 1996     70     NA    8.2     6 14.5  0.0 0.0
## 5 5 $50,000 Climax Show, The 1975     71     NA    3.4    17 24.5  4.5 0.0
## 6 6                    $pent 2000     91     NA    4.3    45  4.5  4.5 4.5
##     r4   r5   r6   r7   r8   r9  r10 mpaa Action Animation Comedy Drama
## 1  4.5 14.5 24.5 24.5 14.5  4.5  4.5           0         0      1     1
## 2 24.5 14.5 14.5 14.5  4.5  4.5 14.5           0         0      1     0
## 3  0.0  0.0 24.5  0.0 44.5 24.5 24.5           0         1      0     0
## 4  0.0  0.0  0.0  0.0  0.0 34.5 45.5           0         0      1     0
## 5 14.5 14.5  4.5  0.0  0.0  0.0 24.5           0         0      0     0
## 6 14.5 14.5 14.5  4.5  4.5 14.5 14.5           0         0      0     1
##   Documentary Romance Short
## 1           0       0     0
## 2           0       0     0
## 3           0       0     1
## 4           0       0     0
## 5           0       0     0
## 6           0       0     0

pretty_movies #notice that is prints nicely

## # A tibble: 58,788 × 25
##        X                    title  year length budget rating votes    r1
##    <int>                    <chr> <int>  <int>  <int>  <dbl> <int> <dbl>
## 1      1                        $  1971    121     NA    6.4   348   4.5
## 2      2        $1000 a Touchdown  1939     71     NA    6.0    20   0.0
## 3      3   $21 a Day Once a Month  1941      7     NA    8.2     5   0.0
## 4      4                  $40,000  1996     70     NA    8.2     6  14.5
## 5      5 $50,000 Climax Show, The  1975     71     NA    3.4    17  24.5
## 6      6                    $pent  2000     91     NA    4.3    45   4.5
## 7      7                  $windle  2002     93     NA    5.3   200   4.5
## 8      8                     '15'  2002     25     NA    6.7    24   4.5
## 9      9                      '38  1987     97     NA    6.6    18   4.5
## 10    10                  '49-'17  1917     61     NA    6.0    51   4.5
## # ... with 58,778 more rows, and 17 more variables: r2 <dbl>, r3 <dbl>,
## #   r4 <dbl>, r5 <dbl>, r6 <dbl>, r7 <dbl>, r8 <dbl>, r9 <dbl>, r10 <dbl>,
## #   mpaa <chr>, Action <int>, Animation <int>, Comedy <int>, Drama <int>,
## #   Documentary <int>, Romance <int>, Short <int>

glimpse(movies) #easier to read than str()

## Observations: 58,788
## Variables: 25
## $ X           <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ title       <chr> "$", "$1000 a Touchdown", "$21 a Day Once a Month"...
## $ year        <int> 1971, 1939, 1941, 1996, 1975, 2000, 2002, 2002, 19...
## $ length      <int> 121, 71, 7, 70, 71, 91, 93, 25, 97, 61, 99, 96, 10...
## $ budget      <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ rating      <dbl> 6.4, 6.0, 8.2, 8.2, 3.4, 4.3, 5.3, 6.7, 6.6, 6.0, ...
## $ votes       <int> 348, 20, 5, 6, 17, 45, 200, 24, 18, 51, 23, 53, 44...
## $ r1          <dbl> 4.5, 0.0, 0.0, 14.5, 24.5, 4.5, 4.5, 4.5, 4.5, 4.5...
## $ r2          <dbl> 4.5, 14.5, 0.0, 0.0, 4.5, 4.5, 0.0, 4.5, 4.5, 0.0,...
## $ r3          <dbl> 4.5, 4.5, 0.0, 0.0, 0.0, 4.5, 4.5, 4.5, 4.5, 4.5, ...
## $ r4          <dbl> 4.5, 24.5, 0.0, 0.0, 14.5, 14.5, 4.5, 4.5, 0.0, 4....
## $ r5          <dbl> 14.5, 14.5, 0.0, 0.0, 14.5, 14.5, 24.5, 4.5, 0.0, ...
## $ r6          <dbl> 24.5, 14.5, 24.5, 0.0, 4.5, 14.5, 24.5, 14.5, 0.0,...
## $ r7          <dbl> 24.5, 14.5, 0.0, 0.0, 0.0, 4.5, 14.5, 14.5, 34.5, ...
## $ r8          <dbl> 14.5, 4.5, 44.5, 0.0, 0.0, 4.5, 4.5, 14.5, 14.5, 4...
## $ r9          <dbl> 4.5, 4.5, 24.5, 34.5, 0.0, 14.5, 4.5, 4.5, 4.5, 4....
## $ r10         <dbl> 4.5, 14.5, 24.5, 45.5, 24.5, 14.5, 14.5, 14.5, 24....
## $ mpaa        <chr> "", "", "", "", "", "", "R", "", "", "", "", "", "...
## $ Action      <int> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,...
## $ Animation   <int> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Comedy      <int> 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,...
## $ Drama       <int> 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,...
## $ Documentary <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Romance     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Short       <int> 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,...

FILTER

movies_with_budgets <- filter(movies, !is.na(budget))
head(filter(movies, Documentary==1) )

##     X                                            title year length budget
## 1   8                                             '15' 2002     25     NA
## 2  18 'Bullitt': Steve McQueen's Commitment to Reality 1968     10     NA
## 3  50                 'What's Your 'I.Q.'?' Number Two 1940      9     NA
## 4 101                                     1 Giant Leap 2002    155     NA
## 5 104        1,99 - Um Supermercado Que Vende Palavras 2003     72     NA
## 6 107                                       1/2 Mensch 1986     48     NA
##   rating votes  r1  r2   r3   r4   r5   r6   r7   r8   r9  r10 mpaa Action
## 1    6.7    24 4.5 4.5  4.5  4.5  4.5 14.5 14.5 14.5  4.5 14.5           0
## 2    6.6    37 0.0 0.0  4.5  4.5  4.5 14.5 44.5 14.5  4.5  4.5           0
## 3    5.9     9 0.0 0.0 14.5 14.5 24.5 24.5 14.5  0.0 14.5 14.5           0
## 4    6.6    75 4.5 4.5  0.0  4.5  4.5 14.5 24.5 14.5 14.5 34.5           0
## 5    4.9    28 4.5 4.5  4.5  4.5 14.5  4.5 14.5 24.5  4.5 14.5           0
## 6    8.1    29 4.5 0.0  0.0  0.0  4.5  0.0  0.0  4.5 24.5 64.5           0
##   Animation Comedy Drama Documentary Romance Short
## 1         0      0     0           1       0     1
## 2         0      0     0           1       0     1
## 3         0      0     0           1       0     1
## 4         0      0     0           1       0     0
## 5         0      1     0           1       0     0
## 6         0      0     0           1       0     0

filter(movies, Documentary==1) %>% nrow()

## [1] 3472

good_comedies <- filter(movies, rating > 9, Comedy==1) 
dim(good_comedies) #171 movies

## [1] 171  25

# Let us say we only want highly rated comdies, 
# which a lot of people have watched,
# made after year 2000.
head(movies %>%  filter(rating >8, Comedy==1, votes > 100, year > 2000) )

##       X                                 title year length   budget rating
## 1  5535                              Big Fish 2003    125 70000000    8.1
## 2  7104                 Bowling for Columbine 2002    120  4000000    8.5
## 3 10096                     Cielo abierto, El 2001    110       NA    8.2
## 4 15356                           Dzien swira 2002     93       NA    8.3
## 5 15612                             Eierdiebe 2003     87       NA    8.1
## 6 16424 Eternal Sunshine of the Spotless Mind 2004    108 20000000    8.6
##   votes  r1  r2  r3  r4  r5  r6   r7   r8   r9  r10  mpaa Action Animation
## 1 31525 4.5 4.5 4.5 4.5 4.5 4.5 14.5 24.5 24.5 24.5 PG-13      0         0
## 2 36747 4.5 4.5 4.5 4.5 4.5 4.5  4.5 14.5 24.5 34.5     R      0         0
## 3   191 4.5 4.5 4.5 4.5 4.5 4.5 14.5 24.5 24.5 24.5            0         0
## 4   349 4.5 4.5 4.5 4.5 4.5 4.5  4.5 14.5 24.5 44.5            0         0
## 5   130 4.5 4.5 0.0 4.5 4.5 4.5 14.5 14.5 14.5 34.5            0         0
## 6 46240 4.5 4.5 4.5 4.5 4.5 4.5  4.5 14.5 24.5 44.5     R      0         0
##   Comedy Drama Documentary Romance Short
## 1      1     1           0       0     0
## 2      1     1           1       0     0
## 3      1     0           0       1     0
## 4      1     1           0       0     0
## 5      1     1           0       1     0
## 6      1     1           0       1     0

head(glimpse(good_comedies) )

## Observations: 171
## Variables: 25
## $ X           <int> 59, 718, 2086, 2717, 2822, 3173, 3491, 3935, 3995,...
## $ title       <chr> "+1 -1", "Abduction of Figaro", "America 101", "An...
## $ year        <int> 1987, 1984, 2005, 1919, 1941, 2004, 1962, 1962, 20...
## $ length      <int> 7, 144, 86, 60, 80, 109, 83, 61, 9, 85, 91, 90, 15...
## $ budget      <int> NA, NA, NA, NA, NA, 3200000, NA, NA, NA, NA, NA, 5...
## $ rating      <dbl> 9.4, 9.1, 9.5, 9.1, 9.4, 9.2, 9.1, 9.1, 9.3, 9.2, ...
## $ votes       <int> 6, 22, 31, 19, 5, 23, 10, 8, 6, 8, 19, 11, 13, 6, ...
## $ r1          <dbl> 0.0, 0.0, 4.5, 14.5, 0.0, 4.5, 0.0, 0.0, 0.0, 0.0,...
## $ r2          <dbl> 0.0, 0.0, 4.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
## $ r3          <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 14.5, 0.0, 0.0,...
## $ r4          <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
## $ r5          <dbl> 0.0, 0.0, 0.0, 4.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
## $ r6          <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
## $ r7          <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 4.5, 24.5, 0.0, 14.5, 14....
## $ r8          <dbl> 0.0, 14.5, 4.5, 0.0, 24.5, 4.5, 0.0, 0.0, 0.0, 0.0...
## $ r9          <dbl> 45.5, 4.5, 4.5, 14.5, 24.5, 24.5, 14.5, 0.0, 14.5,...
## $ r10         <dbl> 45.5, 84.5, 74.5, 64.5, 64.5, 64.5, 74.5, 84.5, 64...
## $ mpaa        <chr> "", "", "", "", "", "", "", "", "", "", "PG-13", "...
## $ Action      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Animation   <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Comedy      <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ Drama       <int> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,...
## $ Documentary <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Romance     <int> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ Short       <int> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1,...

##      X                     title year length  budget rating votes   r1  r2
## 1   59                     +1 -1 1987      7      NA    9.4     6  0.0 0.0
## 2  718       Abduction of Figaro 1984    144      NA    9.1    22  0.0 0.0
## 3 2086               America 101 2005     86      NA    9.5    31  4.5 4.5
## 4 2717      Anne of Green Gables 1919     60      NA    9.1    19 14.5 0.0
## 5 2822 Anton Ivanovich serditsya 1941     80      NA    9.4     5  0.0 0.0
## 6 3173               Arrangement 2004    109 3200000    9.2    23  4.5 0.0
##   r3 r4  r5 r6  r7   r8   r9  r10 mpaa Action Animation Comedy Drama
## 1  0  0 0.0  0 0.0  0.0 45.5 45.5           0         1      1     0
## 2  0  0 0.0  0 0.0 14.5  4.5 84.5           0         0      1     0
## 3  0  0 0.0  0 0.0  4.5  4.5 74.5           0         0      1     0
## 4  0  0 4.5  0 0.0  0.0 14.5 64.5           0         0      1     1
## 5  0  0 0.0  0 0.0 24.5 24.5 64.5           0         0      1     0
## 6  0  0 0.0  0 4.5  4.5 24.5 64.5           0         0      1     0
##   Documentary Romance Short
## 1           0       0     1
## 2           0       0     0
## 3           0       0     0
## 4           0       0     0
## 5           0       0     0
## 6           0       1     0

head(good_comedies$title)

## [1] "+1 -1"                     "Abduction of Figaro"      
## [3] "America 101"               "Anne of Green Gables"     
## [5] "Anton Ivanovich serditsya" "Arrangement"

S E L E C T

movies_df <- tbl_df(movies)
select(movies_df, title, year, rating) #Just the columns we want to see

## # A tibble: 58,788 × 3
##                       title  year rating
##                       <chr> <int>  <dbl>
## 1                         $  1971    6.4
## 2         $1000 a Touchdown  1939    6.0
## 3    $21 a Day Once a Month  1941    8.2
## 4                   $40,000  1996    8.2
## 5  $50,000 Climax Show, The  1975    3.4
## 6                     $pent  2000    4.3
## 7                   $windle  2002    5.3
## 8                      '15'  2002    6.7
## 9                       '38  1987    6.6
## 10                  '49-'17  1917    6.0
## # ... with 58,778 more rows

select(movies_df, -c(r1:r10)) #we don't want certain columns

## # A tibble: 58,788 × 15
##        X                    title  year length budget rating votes  mpaa
##    <int>                    <chr> <int>  <int>  <int>  <dbl> <int> <chr>
## 1      1                        $  1971    121     NA    6.4   348      
## 2      2        $1000 a Touchdown  1939     71     NA    6.0    20      
## 3      3   $21 a Day Once a Month  1941      7     NA    8.2     5      
## 4      4                  $40,000  1996     70     NA    8.2     6      
## 5      5 $50,000 Climax Show, The  1975     71     NA    3.4    17      
## 6      6                    $pent  2000     91     NA    4.3    45      
## 7      7                  $windle  2002     93     NA    5.3   200     R
## 8      8                     '15'  2002     25     NA    6.7    24      
## 9      9                      '38  1987     97     NA    6.6    18      
## 10    10                  '49-'17  1917     61     NA    6.0    51      
## # ... with 58,778 more rows, and 7 more variables: Action <int>,
## #   Animation <int>, Comedy <int>, Drama <int>, Documentary <int>,
## #   Romance <int>, Short <int>

select(movies_df, -c(budget, r1:r10, Animation, Documentary, Short, Romance)) #we don't want certain columns

## # A tibble: 58,788 × 10
##        X                    title  year length rating votes  mpaa Action
##    <int>                    <chr> <int>  <int>  <dbl> <int> <chr>  <int>
## 1      1                        $  1971    121    6.4   348            0
## 2      2        $1000 a Touchdown  1939     71    6.0    20            0
## 3      3   $21 a Day Once a Month  1941      7    8.2     5            0
## 4      4                  $40,000  1996     70    8.2     6            0
## 5      5 $50,000 Climax Show, The  1975     71    3.4    17            0
## 6      6                    $pent  2000     91    4.3    45            0
## 7      7                  $windle  2002     93    5.3   200     R      1
## 8      8                     '15'  2002     25    6.7    24            0
## 9      9                      '38  1987     97    6.6    18            0
## 10    10                  '49-'17  1917     61    6.0    51            0
## # ... with 58,778 more rows, and 2 more variables: Comedy <int>,
## #   Drama <int>

#more examples of Select
head(select(mtcars, wt, mpg) )

##                      wt  mpg
## Mazda RX4         2.620 21.0
## Mazda RX4 Wag     2.875 21.0
## Datsun 710        2.320 22.8
## Hornet 4 Drive    3.215 21.4
## Hornet Sportabout 3.440 18.7
## Valiant           3.460 18.1

head(select(iris, Species, Sepal.Length) )

##   Species Sepal.Length
## 1  setosa          5.1
## 2  setosa          4.9
## 3  setosa          4.7
## 4  setosa          4.6
## 5  setosa          5.0
## 6  setosa          5.4

head(select(iris, -Species) )

##   Sepal.Length Sepal.Width Petal.Length Petal.Width id
## 1          5.1         3.5          1.4         0.2  1
## 2          4.9         3.0          1.4         0.2  2
## 3          4.7         3.2          1.3         0.2  3
## 4          4.6         3.1          1.5         0.2  4
## 5          5.0         3.6          1.4         0.2  5
## 6          5.4         3.9          1.7         0.4  6

head( select(iris, -c(Species, Petal.Length)) )

##   Sepal.Length Sepal.Width Petal.Width id
## 1          5.1         3.5         0.2  1
## 2          4.9         3.0         0.2  2
## 3          4.7         3.2         0.2  3
## 4          4.6         3.1         0.2  4
## 5          5.0         3.6         0.2  5
## 6          5.4         3.9         0.4  6

#-----------------------------------------------
# MULTIPLE WAYS TO SELECT THE COLUMNS WE DESIRE
#-----------------------------------------------
# SELECT has many USEFUL FEATURES...

# Many Different way to select the columns we are interested in:
select(movies_df, title, year, budget, length) # Variables of interest

## # A tibble: 58,788 × 4
##                       title  year budget length
##                       <chr> <int>  <int>  <int>
## 1                         $  1971     NA    121
## 2         $1000 a Touchdown  1939     NA     71
## 3    $21 a Day Once a Month  1941     NA      7
## 4                   $40,000  1996     NA     70
## 5  $50,000 Climax Show, The  1975     NA     71
## 6                     $pent  2000     NA     91
## 7                   $windle  2002     NA     93
## 8                      '15'  2002     NA     25
## 9                       '38  1987     NA     97
## 10                  '49-'17  1917     NA     61
## # ... with 58,778 more rows

# You can also select a range of columns from start:end
select(movies_df, title:votes)   # All the columns from title to votes get selected

## # A tibble: 58,788 × 6
##                       title  year length budget rating votes
##                       <chr> <int>  <int>  <int>  <dbl> <int>
## 1                         $  1971    121     NA    6.4   348
## 2         $1000 a Touchdown  1939     71     NA    6.0    20
## 3    $21 a Day Once a Month  1941      7     NA    8.2     5
## 4                   $40,000  1996     70     NA    8.2     6
## 5  $50,000 Climax Show, The  1975     71     NA    3.4    17
## 6                     $pent  2000     91     NA    4.3    45
## 7                   $windle  2002     93     NA    5.3   200
## 8                      '15'  2002     25     NA    6.7    24
## 9                       '38  1987     97     NA    6.6    18
## 10                  '49-'17  1917     61     NA    6.0    51
## # ... with 58,778 more rows

# Now, let's say that we want only the Columns that contain movie ratings...
select(movies_df, contains("r"))  # Any column that contains 'r' in its name

## # A tibble: 58,788 × 16
##     year rating    r1    r2    r3    r4    r5    r6    r7    r8    r9
##    <int>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1   1971    6.4   4.5   4.5   4.5   4.5  14.5  24.5  24.5  14.5   4.5
## 2   1939    6.0   0.0  14.5   4.5  24.5  14.5  14.5  14.5   4.5   4.5
## 3   1941    8.2   0.0   0.0   0.0   0.0   0.0  24.5   0.0  44.5  24.5
## 4   1996    8.2  14.5   0.0   0.0   0.0   0.0   0.0   0.0   0.0  34.5
## 5   1975    3.4  24.5   4.5   0.0  14.5  14.5   4.5   0.0   0.0   0.0
## 6   2000    4.3   4.5   4.5   4.5  14.5  14.5  14.5   4.5   4.5  14.5
## 7   2002    5.3   4.5   0.0   4.5   4.5  24.5  24.5  14.5   4.5   4.5
## 8   2002    6.7   4.5   4.5   4.5   4.5   4.5  14.5  14.5  14.5   4.5
## 9   1987    6.6   4.5   4.5   4.5   0.0   0.0   0.0  34.5  14.5   4.5
## 10  1917    6.0   4.5   0.0   4.5   4.5   4.5  44.5  14.5   4.5   4.5
## # ... with 58,778 more rows, and 5 more variables: r10 <dbl>, Drama <int>,
## #   Documentary <int>, Romance <int>, Short <int>

select(movies_df, ends_with("t")) # All vars ending with "test".

## # A tibble: 58,788 × 2
##    budget Short
##     <int> <int>
## 1      NA     0
## 2      NA     0
## 3      NA     1
## 4      NA     0
## 5      NA     0
## 6      NA     0
## 7      NA     0
## 8      NA     1
## 9      NA     0
## 10     NA     0
## # ... with 58,778 more rows

select(movies_df, starts_with("r"))  # Gets all vars staring with "R".

## # A tibble: 58,788 × 12
##    rating    r1    r2    r3    r4    r5    r6    r7    r8    r9   r10
##     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1     6.4   4.5   4.5   4.5   4.5  14.5  24.5  24.5  14.5   4.5   4.5
## 2     6.0   0.0  14.5   4.5  24.5  14.5  14.5  14.5   4.5   4.5  14.5
## 3     8.2   0.0   0.0   0.0   0.0   0.0  24.5   0.0  44.5  24.5  24.5
## 4     8.2  14.5   0.0   0.0   0.0   0.0   0.0   0.0   0.0  34.5  45.5
## 5     3.4  24.5   4.5   0.0  14.5  14.5   4.5   0.0   0.0   0.0  24.5
## 6     4.3   4.5   4.5   4.5  14.5  14.5  14.5   4.5   4.5  14.5  14.5
## 7     5.3   4.5   0.0   4.5   4.5  24.5  24.5  14.5   4.5   4.5  14.5
## 8     6.7   4.5   4.5   4.5   4.5   4.5  14.5  14.5  14.5   4.5  14.5
## 9     6.6   4.5   4.5   4.5   0.0   0.0   0.0  34.5  14.5   4.5  24.5
## 10    6.0   4.5   0.0   4.5   4.5   4.5  44.5  14.5   4.5   4.5   4.5
## # ... with 58,778 more rows, and 1 more variables: Romance <int>

# The above is not quite what we want. We don't want the Romance column

select(movies_df, matches("r[0-9]"))  # Columns that match a regex.

## # A tibble: 58,788 × 10
##       r1    r2    r3    r4    r5    r6    r7    r8    r9   r10
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1    4.5   4.5   4.5   4.5  14.5  24.5  24.5  14.5   4.5   4.5
## 2    0.0  14.5   4.5  24.5  14.5  14.5  14.5   4.5   4.5  14.5
## 3    0.0   0.0   0.0   0.0   0.0  24.5   0.0  44.5  24.5  24.5
## 4   14.5   0.0   0.0   0.0   0.0   0.0   0.0   0.0  34.5  45.5
## 5   24.5   4.5   0.0  14.5  14.5   4.5   0.0   0.0   0.0  24.5
## 6    4.5   4.5   4.5  14.5  14.5  14.5   4.5   4.5  14.5  14.5
## 7    4.5   0.0   4.5   4.5  24.5  24.5  14.5   4.5   4.5  14.5
## 8    4.5   4.5   4.5   4.5   4.5  14.5  14.5  14.5   4.5  14.5
## 9    4.5   4.5   4.5   0.0   0.0   0.0  34.5  14.5   4.5  24.5
## 10   4.5   0.0   4.5   4.5   4.5  44.5  14.5   4.5   4.5   4.5
## # ... with 58,778 more rows

select(movies_df, num_range("r", 1:10)) # q1 thru q4 regardless of location.

## # A tibble: 58,788 × 10
##       r1    r2    r3    r4    r5    r6    r7    r8    r9   r10
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1    4.5   4.5   4.5   4.5  14.5  24.5  24.5  14.5   4.5   4.5
## 2    0.0  14.5   4.5  24.5  14.5  14.5  14.5   4.5   4.5  14.5
## 3    0.0   0.0   0.0   0.0   0.0  24.5   0.0  44.5  24.5  24.5
## 4   14.5   0.0   0.0   0.0   0.0   0.0   0.0   0.0  34.5  45.5
## 5   24.5   4.5   0.0  14.5  14.5   4.5   0.0   0.0   0.0  24.5
## 6    4.5   4.5   4.5  14.5  14.5  14.5   4.5   4.5  14.5  14.5
## 7    4.5   0.0   4.5   4.5  24.5  24.5  14.5   4.5   4.5  14.5
## 8    4.5   4.5   4.5   4.5   4.5  14.5  14.5  14.5   4.5  14.5
## 9    4.5   4.5   4.5   0.0   0.0   0.0  34.5  14.5   4.5  24.5
## 10   4.5   0.0   4.5   4.5   4.5  44.5  14.5   4.5   4.5   4.5
## # ... with 58,778 more rows

# COMBINING FILTER WITH SELECT
# How to list movies that have a rating of GT 9.2 and GT 10000 votes?
movies %>%
  filter(rating>=9.1 ,  votes > 1000) %>%
  select(title, votes, rating)

##                       title  votes rating
## 1            Godfather, The 122755    9.1
## 2 Shawshank Redemption, The 149494    9.1

ARRANGE - REORDER (SORT) ROWS

movies_df <- tbl_df(movies)
arrange(movies_df, rating) #but this is not what we want

## # A tibble: 58,788 × 25
##        X                    title  year length budget rating votes    r1
##    <int>                    <chr> <int>  <int>  <int>  <dbl> <int> <dbl>
## 1    274             20/20 Vision  1999     20     NA      1     5  64.5
## 2   2139     American Flatulators  1995      3     NA      1     5 100.0
## 3   3851                  Babo 73  1964     57     NA      1     7  84.5
## 4   4142              Bak syv hav  1991     87     NA      1    85  84.5
## 5   4231              Balls Bluff  1961     24     NA      1    11  84.5
## 6   4994 Bell, Bare and Beautiful  1963     64     NA      1    20  64.5
## 7   5407 Beyond the Wall of Sleep  2004     84     NA      1    16  45.5
## 8   5963               Black Love  1972     70     NA      1     8  74.5
## 9   6013 Black Sherlock Holmes, A  1918     12     NA      1     6 100.0
## 10  6096  Blair Fish Project, The  1999     10     NA      1    14  74.5
## # ... with 58,778 more rows, and 17 more variables: r2 <dbl>, r3 <dbl>,
## #   r4 <dbl>, r5 <dbl>, r6 <dbl>, r7 <dbl>, r8 <dbl>, r9 <dbl>, r10 <dbl>,
## #   mpaa <chr>, Action <int>, Animation <int>, Comedy <int>, Drama <int>,
## #   Documentary <int>, Romance <int>, Short <int>

arrange(movies_df, desc(rating))

## # A tibble: 58,788 × 25
##        X                                   title  year length budget
##    <int>                                   <chr> <int>  <int>  <int>
## 1  13908 Dimensia Minds Trilogy: The Hope Factor  2004     10     NA
## 2  18016                        Fishing for Love  2001      7     NA
## 3  49846                        Summer Sonata, A  2004     30   2000
## 4   5898                            Black Canyon  2004     25   3000
## 5   7711                           Buck Privates  1928     70     NA
## 6  13171                                Defilada  1989     60     NA
## 7  13909        Dimensia Minds Trilogy: The Reds  2004      8   1200
## 8  15019                                Drifting  2004     21   3000
## 9  15659                              Ekdromi, I  1966     85     NA
## 10 19826              Genet parle d'Angela Davis  1970     10     NA
## # ... with 58,778 more rows, and 20 more variables: rating <dbl>,
## #   votes <int>, r1 <dbl>, r2 <dbl>, r3 <dbl>, r4 <dbl>, r5 <dbl>,
## #   r6 <dbl>, r7 <dbl>, r8 <dbl>, r9 <dbl>, r10 <dbl>, mpaa <chr>,
## #   Action <int>, Animation <int>, Comedy <int>, Drama <int>,
## #   Documentary <int>, Romance <int>, Short <int>

arrange(movies_df, desc(rating), desc(year))

## # A tibble: 58,788 × 25
##        X                                   title  year length budget
##    <int>                                   <chr> <int>  <int>  <int>
## 1  13908 Dimensia Minds Trilogy: The Hope Factor  2004     10     NA
## 2  49846                        Summer Sonata, A  2004     30   2000
## 3  18016                        Fishing for Love  2001      7     NA
## 4  27510                      Keeper of the Past  2005     18  30000
## 5   5898                            Black Canyon  2004     25   3000
## 6  13909        Dimensia Minds Trilogy: The Reds  2004      8   1200
## 7  15019                                Drifting  2004     21   3000
## 8  37399                                  Of Age  2004      5   1000
## 9  39498                              Penis Envy  2004      8     NA
## 10 40458                 Plight of Clownana, The  2004     15   6000
## # ... with 58,778 more rows, and 20 more variables: rating <dbl>,
## #   votes <int>, r1 <dbl>, r2 <dbl>, r3 <dbl>, r4 <dbl>, r5 <dbl>,
## #   r6 <dbl>, r7 <dbl>, r8 <dbl>, r9 <dbl>, r10 <dbl>, mpaa <chr>,
## #   Action <int>, Animation <int>, Comedy <int>, Drama <int>,
## #   Documentary <int>, Romance <int>, Short <int>

# Notice the difference, when the year and rating are switched...
arrange(movies_df, desc(year), desc(rating))

## # A tibble: 58,788 × 25
##        X                  title  year length budget rating votes    r1
##    <int>                  <chr> <int>  <int>  <int>  <dbl> <int> <dbl>
## 1  27510     Keeper of the Past  2005     18  30000    9.9     7     0
## 2  20878       Goodnite Charlie  2005    119 100000    9.8    34     0
## 3  37176                 Nun Fu  2005      5   5000    9.8     5     0
## 4  37262              Oath, The  2005     23     NA    9.8     5     0
## 5  56370 Weg ist das Spiel, Der  2005      3     NA    9.8     8     0
## 6  34472             Morphin(e)  2005     20   8000    9.7     7     0
## 7   6133           Blaze Orange  2005     16      0    9.6     7     0
## 8  17759     Filmic Achievement  2005     80     NA    9.6    10     0
## 9  20875         Goodnight Bill  2005     19     NA    9.6     7     0
## 10 57067        Wild Girls Gone  2005     93     NA    9.6     7     0
## # ... with 58,778 more rows, and 17 more variables: r2 <dbl>, r3 <dbl>,
## #   r4 <dbl>, r5 <dbl>, r6 <dbl>, r7 <dbl>, r8 <dbl>, r9 <dbl>, r10 <dbl>,
## #   mpaa <chr>, Action <int>, Animation <int>, Comedy <int>, Drama <int>,
## #   Documentary <int>, Romance <int>, Short <int>

#-----------------------------------
#Another example using mtcars
names(mtcars)

##  [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear"
## [11] "carb"

head( arrange(mtcars, mpg) )

##    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## 1 10.4   8  472 205 2.93 5.250 17.98  0  0    3    4
## 2 10.4   8  460 215 3.00 5.424 17.82  0  0    3    4
## 3 13.3   8  350 245 3.73 3.840 15.41  0  0    3    4
## 4 14.3   8  360 245 3.21 3.570 15.84  0  0    3    4
## 5 14.7   8  440 230 3.23 5.345 17.42  0  0    3    4
## 6 15.0   8  301 335 3.54 3.570 14.60  0  1    5    8

head(arrange(mtcars, desc(mpg)) )

##    mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## 1 33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## 2 32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## 3 30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## 4 30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
## 5 27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## 6 26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2

MUTATE

#Creates a new column
head(mutate(iris, aspect_ratio = Petal.Width/Petal.Length) )

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species id
## 1          5.1         3.5          1.4         0.2  setosa  1
## 2          4.9         3.0          1.4         0.2  setosa  2
## 3          4.7         3.2          1.3         0.2  setosa  3
## 4          4.6         3.1          1.5         0.2  setosa  4
## 5          5.0         3.6          1.4         0.2  setosa  5
## 6          5.4         3.9          1.7         0.4  setosa  6
##   aspect_ratio
## 1    0.1428571
## 2    0.1428571
## 3    0.1538462
## 4    0.1333333
## 5    0.1428571
## 6    0.2352941

movies_with_budgets <- filter(movies_df, !is.na(budget))
mutate(movies_with_budgets, costPerMinute = budget/length) %>%
  select(title, costPerMinute)

## # A tibble: 5,215 × 2
##                         title costPerMinute
##                         <chr>         <dbl>
## 1                     'G' Men     5294.1176
## 2   'Manos' the Hands of Fate      256.7568
## 3          'Til There Was You   203539.8230
## 4             .com for Murder    52083.3333
## 5  10 Things I Hate About You   164948.4536
## 6               100 Mile Rule    11224.4898
## 7                   100 Proof     1489.3617
## 8                         101     1709.4017
## 9            101-vy kilometer     1941.7476
## 10             102 Dalmatians   850000.0000
## # ... with 5,205 more rows

GROUP_BY AND SUMMARISE

# This seems how roup_by works...
by_rating <- group_by(movies_df, rating)
head(movies_df)

## # A tibble: 6 × 25
##       X                    title  year length budget rating votes    r1
##   <int>                    <chr> <int>  <int>  <int>  <dbl> <int> <dbl>
## 1     1                        $  1971    121     NA    6.4   348   4.5
## 2     2        $1000 a Touchdown  1939     71     NA    6.0    20   0.0
## 3     3   $21 a Day Once a Month  1941      7     NA    8.2     5   0.0
## 4     4                  $40,000  1996     70     NA    8.2     6  14.5
## 5     5 $50,000 Climax Show, The  1975     71     NA    3.4    17  24.5
## 6     6                    $pent  2000     91     NA    4.3    45   4.5
## # ... with 17 more variables: r2 <dbl>, r3 <dbl>, r4 <dbl>, r5 <dbl>,
## #   r6 <dbl>, r7 <dbl>, r8 <dbl>, r9 <dbl>, r10 <dbl>, mpaa <chr>,
## #   Action <int>, Animation <int>, Comedy <int>, Drama <int>,
## #   Documentary <int>, Romance <int>, Short <int>

by_rating

## Source: local data frame [58,788 x 25]
## Groups: rating [91]
## 
##        X                    title  year length budget rating votes    r1
##    <int>                    <chr> <int>  <int>  <int>  <dbl> <int> <dbl>
## 1      1                        $  1971    121     NA    6.4   348   4.5
## 2      2        $1000 a Touchdown  1939     71     NA    6.0    20   0.0
## 3      3   $21 a Day Once a Month  1941      7     NA    8.2     5   0.0
## 4      4                  $40,000  1996     70     NA    8.2     6  14.5
## 5      5 $50,000 Climax Show, The  1975     71     NA    3.4    17  24.5
## 6      6                    $pent  2000     91     NA    4.3    45   4.5
## 7      7                  $windle  2002     93     NA    5.3   200   4.5
## 8      8                     '15'  2002     25     NA    6.7    24   4.5
## 9      9                      '38  1987     97     NA    6.6    18   4.5
## 10    10                  '49-'17  1917     61     NA    6.0    51   4.5
## # ... with 58,778 more rows, and 17 more variables: r2 <dbl>, r3 <dbl>,
## #   r4 <dbl>, r5 <dbl>, r6 <dbl>, r7 <dbl>, r8 <dbl>, r9 <dbl>, r10 <dbl>,
## #   mpaa <chr>, Action <int>, Animation <int>, Comedy <int>, Drama <int>,
## #   Documentary <int>, Romance <int>, Short <int>

by_year <- group_by(movies_df, year)
by_year

## Source: local data frame [58,788 x 25]
## Groups: year [113]
## 
##        X                    title  year length budget rating votes    r1
##    <int>                    <chr> <int>  <int>  <int>  <dbl> <int> <dbl>
## 1      1                        $  1971    121     NA    6.4   348   4.5
## 2      2        $1000 a Touchdown  1939     71     NA    6.0    20   0.0
## 3      3   $21 a Day Once a Month  1941      7     NA    8.2     5   0.0
## 4      4                  $40,000  1996     70     NA    8.2     6  14.5
## 5      5 $50,000 Climax Show, The  1975     71     NA    3.4    17  24.5
## 6      6                    $pent  2000     91     NA    4.3    45   4.5
## 7      7                  $windle  2002     93     NA    5.3   200   4.5
## 8      8                     '15'  2002     25     NA    6.7    24   4.5
## 9      9                      '38  1987     97     NA    6.6    18   4.5
## 10    10                  '49-'17  1917     61     NA    6.0    51   4.5
## # ... with 58,778 more rows, and 17 more variables: r2 <dbl>, r3 <dbl>,
## #   r4 <dbl>, r5 <dbl>, r6 <dbl>, r7 <dbl>, r8 <dbl>, r9 <dbl>, r10 <dbl>,
## #   mpaa <chr>, Action <int>, Animation <int>, Comedy <int>, Drama <int>,
## #   Documentary <int>, Romance <int>, Short <int>

# Let's see the histogram of movies by rating, by year
head(by_rating %>% summarize(n()) )

## # A tibble: 6 × 2
##   rating `n()`
##    <dbl> <int>
## 1    1.0   106
## 2    1.1    44
## 3    1.2    36
## 4    1.3    37
## 5    1.4    49
## 6    1.5    59

by_year %>% summarize(avg_rating = mean(rating))

## # A tibble: 113 × 2
##     year avg_rating
##    <int>      <dbl>
## 1   1893   7.000000
## 2   1894   4.888889
## 3   1895   5.500000
## 4   1896   5.269231
## 5   1897   4.677778
## 6   1898   5.040000
## 7   1899   4.277778
## 8   1900   4.731250
## 9   1901   4.682143
## 10  1902   4.900000
## # ... with 103 more rows

head( summarise(by_rating,
          num_movies_by_rating = n(),
          average_length = mean(length))
      )

## # A tibble: 6 × 3
##   rating num_movies_by_rating average_length
##    <dbl>                <int>          <dbl>
## 1    1.0                  106       64.91509
## 2    1.1                   44       70.65909
## 3    1.2                   36       66.47222
## 4    1.3                   37       73.67568
## 5    1.4                   49       65.87755
## 6    1.5                   59       74.47458

movies %>%
  group_by(year) %>%
  summarize(MoviesPerYear=n())

## # A tibble: 113 × 2
##     year MoviesPerYear
##    <int>         <int>
## 1   1893             1
## 2   1894             9
## 3   1895             3
## 4   1896            13
## 5   1897             9
## 6   1898             5
## 7   1899             9
## 8   1900            16
## 9   1901            28
## 10  1902             9
## # ... with 103 more rows

    avg_rating_by_year <- 
      group_by(movies_df, year) %>%
      summarize(avg_rating = mean(rating))

avg_rating_by_year

## # A tibble: 113 × 2
##     year avg_rating
##    <int>      <dbl>
## 1   1893   7.000000
## 2   1894   4.888889
## 3   1895   5.500000
## 4   1896   5.269231
## 5   1897   4.677778
## 6   1898   5.040000
## 7   1899   4.277778
## 8   1900   4.731250
## 9   1901   4.682143
## 10  1902   4.900000
## # ... with 103 more rows

## GROUP_BY EXAMPLE #2
titanic <- as.data.frame(Titanic)
titanic #flattened table... with 32 rows and 5 columns

##    Class    Sex   Age Survived Freq
## 1    1st   Male Child       No    0
## 2    2nd   Male Child       No    0
## 3    3rd   Male Child       No   35
## 4   Crew   Male Child       No    0
## 5    1st Female Child       No    0
## 6    2nd Female Child       No    0
## 7    3rd Female Child       No   17
## 8   Crew Female Child       No    0
## 9    1st   Male Adult       No  118
## 10   2nd   Male Adult       No  154
## 11   3rd   Male Adult       No  387
## 12  Crew   Male Adult       No  670
## 13   1st Female Adult       No    4
## 14   2nd Female Adult       No   13
## 15   3rd Female Adult       No   89
## 16  Crew Female Adult       No    3
## 17   1st   Male Child      Yes    5
## 18   2nd   Male Child      Yes   11
## 19   3rd   Male Child      Yes   13
## 20  Crew   Male Child      Yes    0
## 21   1st Female Child      Yes    1
## 22   2nd Female Child      Yes   13
## 23   3rd Female Child      Yes   14
## 24  Crew Female Child      Yes    0
## 25   1st   Male Adult      Yes   57
## 26   2nd   Male Adult      Yes   14
## 27   3rd   Male Adult      Yes   75
## 28  Crew   Male Adult      Yes  192
## 29   1st Female Adult      Yes  140
## 30   2nd Female Adult      Yes   80
## 31   3rd Female Adult      Yes   76
## 32  Crew Female Adult      Yes   20

names(titanic) #Freq is the column that shows number of people

## [1] "Class"    "Sex"      "Age"      "Survived" "Freq"

# How many people in each class in Titanic?
titanic %>%
  group_by(Class) %>%
  summarize(TotalByClass = sum(Freq))

## # A tibble: 4 × 2
##    Class TotalByClass
##   <fctr>        <dbl>
## 1    1st          325
## 2    2nd          285
## 3    3rd          706
## 4   Crew          885

titanic %>%  group_by(Class, Survived) %>%  summarize(TotalByClassBySurvived = sum(Freq))

## Source: local data frame [8 x 3]
## Groups: Class [?]
## 
##    Class Survived TotalByClassBySurvived
##   <fctr>   <fctr>                  <dbl>
## 1    1st       No                    122
## 2    1st      Yes                    203
## 3    2nd       No                    167
## 4    2nd      Yes                    118
## 5    3rd       No                    528
## 6    3rd      Yes                    178
## 7   Crew       No                    673
## 8   Crew      Yes                    212

titanic %>%  group_by(Sex, Survived) %>%  summarize(TotalByGenderBySurvived = sum(Freq))

## Source: local data frame [4 x 3]
## Groups: Sex [?]
## 
##      Sex Survived TotalByGenderBySurvived
##   <fctr>   <fctr>                   <dbl>
## 1   Male       No                    1364
## 2   Male      Yes                     367
## 3 Female       No                     126
## 4 Female      Yes                     344

titanic %>%  group_by(Class, Sex, Survived) %>%  summarize(GroupTotal = sum(Freq))

## Source: local data frame [16 x 4]
## Groups: Class, Sex [?]
## 
##     Class    Sex Survived GroupTotal
##    <fctr> <fctr>   <fctr>      <dbl>
## 1     1st   Male       No        118
## 2     1st   Male      Yes         62
## 3     1st Female       No          4
## 4     1st Female      Yes        141
## 5     2nd   Male       No        154
## 6     2nd   Male      Yes         25
## 7     2nd Female       No         13
## 8     2nd Female      Yes         93
## 9     3rd   Male       No        422
## 10    3rd   Male      Yes         88
## 11    3rd Female       No        106
## 12    3rd Female      Yes         90
## 13   Crew   Male       No        670
## 14   Crew   Male      Yes        192
## 15   Crew Female       No          3
## 16   Crew Female      Yes         20

#----------------------------------------------------------

#Example of grouping by TWO variables...
mtcars %>%
  group_by(cyl, gear) %>%
  summarize(datapoints = n())

## Source: local data frame [8 x 3]
## Groups: cyl [?]
## 
##     cyl   gear datapoints
##   <dbl> <fctr>      <int>
## 1     4      3          1
## 2     4      4          8
## 3     4      5          2
## 4     6      3          2
## 5     6      4          4
## 6     6      5          1
## 7     8      3         12
## 8     8      5          2

head(
    mtcars %>%
        group_by(cyl, gear) %>%
        mutate(cyl_gear_count = n()) #Mutate adds a new variable. n() is the count for that group.
    )

## Source: local data frame [6 x 12]
## Groups: cyl, gear [4]
## 
##     mpg   cyl  disp    hp  drat    wt  qsec    vs    am   gear  carb
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fctr> <dbl>
## 1  21.0     6   160   110  3.90 2.620 16.46     0     1      4     4
## 2  21.0     6   160   110  3.90 2.875 17.02     0     1      4     4
## 3  22.8     4   108    93  3.85 2.320 18.61     1     1      4     1
## 4  21.4     6   258   110  3.08 3.215 19.44     1     0      3     1
## 5  18.7     8   360   175  3.15 3.440 17.02     0     0      3     2
## 6  18.1     6   225   105  2.76 3.460 20.22     1     0      3     1
## # ... with 1 more variables: cyl_gear_count <int>

# Advanced - multiple summarize  
mtcars %>%
  group_by(cyl, gear) %>%
  summarize(datapoints = n()) %>%
  summarize(datapoints = n())

## # A tibble: 3 × 2
##     cyl datapoints
##   <dbl>      <int>
## 1     4          3
## 2     6          3
## 3     8          2

# Notice that there are two summrizes here.
# "When you group by multiple variables, 
# each summary peels off one level of the grouping."
# Therfore, grouping by gear is gone after the second summarize.


## EXTRAS

# Chaining...
producers_nightmare <- filter(movies_df, !is.na(budget), year>1990) %>%
  mutate(costPerMinute = budget/length) %>%
  arrange(desc(costPerMinute)) %>%
  select(title, year, costPerMinute)

producers_nightmare

## # A tibble: 3,120 × 3
##                                              title  year costPerMinute
##                                              <chr> <int>         <dbl>
## 1  Star Trek the Experience: The Klingon Encounter  1998      17500000
## 2                   Back to the Future... The Ride  1991      10000000
## 3                       T2 3-D: Battle Across Time  1996       5000000
## 4                                           Tarzan  1999       1704545
## 5               Terminator 3: Rise of the Machines  2003       1605505
## 6                                  Men in Black II  2002       1590909
## 7                                   Wild Wild West  1999       1588785
## 8                                     Spider-Man 2  2004       1574803
## 9                                         Dinosaur  2000       1554878
## 10                              Polar Express, The  2004       1515152
## # ... with 3,110 more rows

## PLOTTING
head(
    movies %>%
        group_by(rating) %>%
        summarize(n())
    )

## # A tibble: 6 × 2
##   rating `n()`
##    <dbl> <int>
## 1    1.0   106
## 2    1.1    44
## 3    1.2    36
## 4    1.3    37
## 5    1.4    49
## 6    1.5    59

# you can even pipe it to Plot...
# movies %>%
#  group_by(rating) %>%
#  summarize(n()) %>%
#  plot() # plots the histogram of movies by Each value of rating

# movies %>%
#  group_by(year) %>%
#  summarise(y=mean(rating)) %>%
#  with(barplot(y, names.arg=year, main="AVG IMDB Rating by Year"))


#----------------------------------------------------------
# Extras

# EXAMPLE of GROUP_BY (tapply in base R does the same thing.)
# What if we wanted to calculate the average rating for each mpaa rating?
# Are PG movies rated higher than R rated movies?
movies %>%
  filter(mpaa != "", length>80) %>%
  group_by(mpaa) %>%
  summarize(mean(rating))

## # A tibble: 4 × 2
##    mpaa `mean(rating)`
##   <chr>          <dbl>
## 1 NC-17       5.362500
## 2    PG       5.636456
## 3 PG-13       5.796939
## 4     R       5.445374

#compare against
tapply(movies$rating, movies$mpaa, mean)

##             NC-17       PG    PG-13        R 
## 5.970537 5.362500 5.605492 5.800698 5.424874

#Try this yourself first:
#Can you print the top 2 movies (by rating) for each year?

#SPOLIER ALERT:
#SHOW THE TOP 2 MOVIES for each year
movies %>%
  group_by(year) %>%
  filter(year>2000) %>%
  arrange(year, desc(rating)) %>%
  filter(row_number(year) <=2) %>%
  select(year, rating, title)

## Source: local data frame [10 x 3]
## Groups: year [5]
## 
##     year rating                                   title
##    <int>  <dbl>                                   <chr>
## 1   2001   10.0                        Fishing for Love
## 2   2001    9.8                             Martin Four
## 3   2002    9.8                         Copi, je t'aime
## 4   2002    9.7                      Thanks for Nothing
## 5   2003    9.8                          I Am My Resume
## 6   2003    9.8                               TankUp.US
## 7   2004   10.0 Dimensia Minds Trilogy: The Hope Factor
## 8   2004   10.0                        Summer Sonata, A
## 9   2005    9.9                      Keeper of the Past
## 10  2005    9.8                        Goodnite Charlie

Pipe into Plot

names(movies)

##  [1] "X"           "title"       "year"        "length"      "budget"     
##  [6] "rating"      "votes"       "r1"          "r2"          "r3"         
## [11] "r4"          "r5"          "r6"          "r7"          "r8"         
## [16] "r9"          "r10"         "mpaa"        "Action"      "Animation"  
## [21] "Comedy"      "Drama"       "Documentary" "Romance"     "Short"

head(movies)

##   X                    title year length budget rating votes   r1   r2  r3
## 1 1                        $ 1971    121     NA    6.4   348  4.5  4.5 4.5
## 2 2        $1000 a Touchdown 1939     71     NA    6.0    20  0.0 14.5 4.5
## 3 3   $21 a Day Once a Month 1941      7     NA    8.2     5  0.0  0.0 0.0
## 4 4                  $40,000 1996     70     NA    8.2     6 14.5  0.0 0.0
## 5 5 $50,000 Climax Show, The 1975     71     NA    3.4    17 24.5  4.5 0.0
## 6 6                    $pent 2000     91     NA    4.3    45  4.5  4.5 4.5
##     r4   r5   r6   r7   r8   r9  r10 mpaa Action Animation Comedy Drama
## 1  4.5 14.5 24.5 24.5 14.5  4.5  4.5           0         0      1     1
## 2 24.5 14.5 14.5 14.5  4.5  4.5 14.5           0         0      1     0
## 3  0.0  0.0 24.5  0.0 44.5 24.5 24.5           0         1      0     0
## 4  0.0  0.0  0.0  0.0  0.0 34.5 45.5           0         0      1     0
## 5 14.5 14.5  4.5  0.0  0.0  0.0 24.5           0         0      0     0
## 6 14.5 14.5 14.5  4.5  4.5 14.5 14.5           0         0      0     1
##   Documentary Romance Short
## 1           0       0     0
## 2           0       0     0
## 3           0       0     1
## 4           0       0     0
## 5           0       0     0
## 6           0       0     0

df1 <- movies %>%  group_by(rating) %>% summarize(n())
head(df1)

## # A tibble: 6 × 2
##   rating `n()`
##    <dbl> <int>
## 1    1.0   106
## 2    1.1    44
## 3    1.2    36
## 4    1.3    37
## 5    1.4    49
## 6    1.5    59

df1 %>%   plot()

# plots the histogram of movies by Each value of rating!

movies %>%
    group_by(year) %>%
    summarise(y=mean(rating)) %>%
    with(barplot(y, names.arg=year, main="AVG IMDB Rating by Year"))

Class3Ex_Data_Manipulation

Andrew Zhang

Monday, November 10, 2014, updated on 06/04/2017

Numbers

Not a Number

DEDUPING - Removing DUPLICATED VALUES

Convert strings to numbers

factors in R

Data Manipulation

USE of SUBSET() COMMAND - seesm like “SELECT” of SQL

Aggregating using TABLE

X T A B S # C ROSS TABULATION

xtabs Uses the ‘formula syntax’

AGGREGATE

C U T

A P P L Y Family of Functions

T A P P L Y

M E R G E

We want to fill in all missing hrs from 1 to 10, and fill in the value 0 for those.

TIDY DATA: RESHAPING (ref. p8 of ITDA_Lecture+4a)

Wide Data: good for human to read; Long data: good for R

DATA MANIPULATION WITH DPLYR

FILTER

S E L E C T

ARRANGE - REORDER (SORT) ROWS

MUTATE

GROUP_BY AND SUMMARISE

Pipe into Plot