Data wrangling

install.packages(c('dplyr','data.table'))

library(data.table) # melt & aggregate data
library(dplyr) # data manipulation
install.packages('tidyverse')
library(tidyverse) #includes ggplot2, dplyr, tidy, + more


Burrito data: https://srcole.github.io/100burritos/

url<-'https://raw.githubusercontent.com/collnell/burritos/master/sd_burritos.csv'
ritos <- fread(url)

head(ritos)
                 Location    Burrito    Date Neighborhood
1:   graciela's taco shop    adobada 2/14/16  chula vista
2:   graciela's taco shop   barbacoa 2/14/16  chula vista
3:    cortez mexican food california 10/7/16      bonsall
4: el pueblo mexican food california 8/10/16      cardiff
5:           pollos maria california 1/27/16     carlsbad
6:         senor grubby's california 4/24/16     carlsbad
                   Address Yelp Google Chips Cost Hunger Length Circum
1:        5047 Central Ave  4.0    4.5    No 5.99      3     NA     NA
2:        5047 Central Ave  4.0    4.5    No 5.99      2     NA     NA
3:         5517 Mission Rd  4.2    4.0    No 6.25      4   22.5   18.0
4:       820 Birmingham Dr  4.0    4.7    No 4.99      4   19.0   21.0
5:         3055 Harding St  4.0    3.8   Yes 6.59      4     NA     NA
6: 377 Carlsbad Village Dr  4.0    4.1    No 9.00      2   19.0   21.5
   Volume Tortilla Temp Meat Fillings Meat.filling Uniformity Salsa
1:     NA      4.0  4.0  3.0      3.5          4.0        4.5   4.0
2:     NA      3.5  4.0  3.5       NA          4.0         NA   4.0
3:   0.58      3.5  4.0  2.5      3.0          1.5        2.5   2.5
4:   0.67      4.5  4.5  3.5      4.0          4.5        5.0   2.5
5:     NA      4.0  5.0  4.0      3.5          4.5        5.0   2.5
6:   0.70      2.0  3.5  3.0      1.5          1.0        1.0   2.5
   Synergy Wrap Taste Rec Reviewer
1:     4.0  4.5   4.0  NA    Scott
2:     4.0  1.5   4.0  NA    Emily
3:     2.8  5.0   3.2  no    Scott
4:     4.5  5.0   4.3 yes    Scott
5:     4.5  4.0   4.2  NA    Scott
6:     1.5  3.5   1.5  no    Scott
                                                                                Notes
1:                                                                  Salsa verde spicy
2:                                                                                   
3:                                                                                   
4:                                                                                   
5:                                                                                   
6: The ends of the burrito were decent, but the middle was terrible without much meat
   Unreliable NonSD Beef Pico Guac Cheese Fries Sour.cream Pork Chicken
1:                0   NA   NA   NA     NA    NA         NA    1      NA
2:                0    1   NA   NA     NA    NA         NA   NA      NA
3:                0    1    1    1      1     1          1   NA      NA
4:                0    1    1   NA      1     1         NA   NA      NA
5:                0    1    1   NA      1     1         NA   NA      NA
6:                0    1    1   NA      1     1         NA   NA      NA
   Shrimp Fish Rice Beans Lettuce Tomato Bell.peper Carrots Cabbage Sauce
1:     NA   NA   NA     1      NA     NA         NA      NA      NA    NA
2:     NA   NA   NA    NA      NA     NA         NA      NA      NA    NA
3:     NA   NA   NA    NA      NA     NA         NA      NA      NA    NA
4:     NA   NA   NA    NA      NA     NA         NA      NA      NA    NA
5:     NA   NA   NA    NA      NA     NA         NA      NA      NA    NA
6:     NA   NA   NA    NA      NA     NA         NA      NA      NA    NA
   Salsa.1 Cilantro Onion Taquito Pineapple Ham Chile.relleno Nopales
1:       1        1     1      NA        NA  NA            NA      NA
2:      NA        1     1      NA        NA  NA            NA      NA
3:      NA       NA    NA      NA        NA  NA            NA      NA
4:      NA       NA    NA      NA        NA  NA            NA      NA
5:      NA       NA    NA      NA        NA  NA            NA      NA
6:      NA       NA    NA      NA        NA  NA            NA      NA
   Lobster Queso Egg Mushroom Bacon Sushi Avocado Corn Zucchini
1:      NA    NA  NA       NA    NA    NA      NA   NA       NA
2:      NA    NA  NA       NA    NA    NA      NA   NA       NA
3:      NA    NA  NA       NA    NA    NA      NA   NA       NA
4:      NA    NA  NA       NA    NA    NA      NA   NA       NA
5:      NA    NA  NA       NA    NA    NA      NA   NA       NA
6:      NA    NA  NA       NA    NA    NA      NA   NA       NA

dplyr

Verbs

filter

Filter data to San Diego:

dim(ritos)
[1] 340  63
ritos<-filter(ritos, NonSD == 0) 
dim(ritos)  
[1] 332  63
length(unique(ritos$Burrito)) #different kinds of burritos
[1] 90

Filter data to only California burritos:

ca<-filter(ritos, grepl('california', ritos$Burrito)) #pattern matching

length(unique(ca$Burrito))
[1] 20

select

Select columns of a dataframe using variable names

df<-select(ritos, Location, Yelp)
head(df)
                Location Yelp
1   graciela's taco shop  4.0
2   graciela's taco shop  4.0
3    cortez mexican food  4.2
4 el pueblo mexican food  4.0
5           pollos maria  4.0
6         senor grubby's  4.0
df<-select(ritos, Tortilla:Wrap) #columns with burrito ratings 
head(df)
  Tortilla Temp Meat Fillings Meat.filling Uniformity Salsa Synergy Wrap
1      4.0  4.0  3.0      3.5          4.0        4.5   4.0     4.0  4.5
2      3.5  4.0  3.5       NA          4.0         NA   4.0     4.0  1.5
3      3.5  4.0  2.5      3.0          1.5        2.5   2.5     2.8  5.0
4      4.5  4.5  3.5      4.0          4.5        5.0   2.5     4.5  5.0
5      4.0  5.0  4.0      3.5          4.5        5.0   2.5     4.5  4.0
6      2.0  3.5  3.0      1.5          1.0        1.0   2.5     1.5  3.5
#drop columns
df<-select(ritos, -Salsa, -Synergy, -Wrap)
colnames(df)
 [1] "Location"      "Burrito"       "Date"          "Neighborhood" 
 [5] "Address"       "Yelp"          "Google"        "Chips"        
 [9] "Cost"          "Hunger"        "Length"        "Circum"       
[13] "Volume"        "Tortilla"      "Temp"          "Meat"         
[17] "Fillings"      "Meat.filling"  "Uniformity"    "Taste"        
[21] "Rec"           "Reviewer"      "Notes"         "Unreliable"   
[25] "NonSD"         "Beef"          "Pico"          "Guac"         
[29] "Cheese"        "Fries"         "Sour.cream"    "Pork"         
[33] "Chicken"       "Shrimp"        "Fish"          "Rice"         
[37] "Beans"         "Lettuce"       "Tomato"        "Bell.peper"   
[41] "Carrots"       "Cabbage"       "Sauce"         "Salsa.1"      
[45] "Cilantro"      "Onion"         "Taquito"       "Pineapple"    
[49] "Ham"           "Chile.relleno" "Nopales"       "Lobster"      
[53] "Queso"         "Egg"           "Mushroom"      "Bacon"        
[57] "Sushi"         "Avocado"       "Corn"          "Zucchini"     

mutate

Add new variables using existing columns

df<-select(ritos, Taste, Rec, Cost, Volume)
df<-mutate(df, Cost_vol = Cost / Volume) #average rating
head(df)
  Taste  Rec Cost Volume  Cost_vol
1   4.0 <NA> 5.99     NA        NA
2   4.0 <NA> 5.99     NA        NA
3   3.2   no 6.25   0.58 10.775862
4   4.3  yes 4.99   0.67  7.447761
5   4.2 <NA> 6.59     NA        NA
6   1.5   no 9.00   0.70 12.857143

ggplot(df, aes(x=Cost_vol, y=Taste))+
  geom_point(aes(color=Rec))+
  labs(x='Cost by Volume')

%>% pipe

Combine multiple operations in series
- Easy to read, reduces nesting
- Create fewer dataframes

cali <- ritos %>%
  filter(NonSD == 0, grepl('california', ritos$Burrito))%>%
  select(-NonSD)%>%
  mutate(Cost_vol = Cost / Volume)

group_by & summarize

Summarize multiple rows

##find the mean
summarize(ritos, mean(Cost, na.rm = TRUE))
  mean(Cost, na.rm = TRUE)
1                 6.932308
summarize(ritos, sd(Cost, na.rm = TRUE))
  sd(Cost, na.rm = TRUE)
1                1.19633
#Uniformity by Rec
ritos %>% group_by(Rec) %>%
  summarize(mean(Uniformity, na.rm = TRUE))
# A tibble: 3 × 2
    Rec `mean(Uniformity, na.rm = TRUE)`
  <chr>                            <dbl>
1    no                         2.881250
2   yes                         3.691720
3  <NA>                         3.341284
#summarize multiple columns
ritos %>%   select(Rec, Yelp, Google) %>%
  group_by(Rec) %>%
  summarize_each(funs(mean(., na.rm = TRUE)))
# A tibble: 3 × 3
    Rec     Yelp   Google
  <chr>    <dbl>    <dbl>
1    no 3.718462 4.100000
2   yes 3.969799 4.270922
3  <NA> 3.773148 4.211765

California burrito

California burrito = carne asada + fries

Create a new dataframe from ‘cali’
- Burritos containing beef & fries
- Find mean, standard error, number of burritos for Taste variable

Hint:

#standard error of the mean x
sem <- function(x) sd(x, na.rm = TRUE)/sqrt(length(x))

arrange

Reorder rows by column values

Where is the best California burrito?

best<-best%>%
  filter(n > 1)%>% #remove locations visited only once
  arrange(quality) 

head(best)
Source: local data frame [6 x 5]
Groups: Neighborhood [5]

     Neighborhood                       Location  quality     n         se
            <chr>                          <chr>    <dbl> <int>      <dbl>
1             utc        los primos mexican food 2.675000     4 0.42106017
2 university city      papa chito's mexican food 2.975000     2 0.77500000
3     kearny mesa             lolita's taco shop 3.150000     4 0.35237291
4        la jolla rigoberto's taco shop la jolla 3.250000     2 0.25000000
5         miramar      tony's fresh mexican food 3.325000     2 0.57500000
6         miramar     california burrito company 3.433333     3 0.06666667

ggplot(data = best, aes(x = reorder(Location, quality), y = quality))+
  geom_bar(stat = 'identity', fill='gold')+
  geom_errorbar(aes(ymin = quality - se, ymax = quality + se), width = .4)+
  coord_flip()+
  labs(title = 'The best burrito', y = 'Burrito Quality', x ='Location')+
  theme_classic()+
  theme(legend.position = 'none')