Ticket Sales

Ex1: Importing the data

# Import sales.csv: sales

# Location of the data set, defined once so both approaches below share it
url_sales <- paste0(
  "http://s3.amazonaws.com/assets.datacamp.com/",
  "production/course_1294/datasets/sales.csv"
)

# Approach 1: save a local copy as "sales.csv", then read that file
download.file(url_sales, "sales.csv")
sales <- read.csv("sales.csv", stringsAsFactors = FALSE)
                          # stringsAsFactors argument set to FALSE:
                          # character strings are preserved

# alternatively, read straight from the URL without saving a local copy
sales <- read.csv(url_sales, stringsAsFactors = FALSE)
                          # FALSE is spelled out rather than F, because F is
                          # an ordinary variable that can be reassigned

Ex2: Examining the data

dim()

dim()

# View dimensions of sales: number of rows, then number of columns
dim(sales)
## [1] 5000   46

head()

head()

# Print the first 6 rows of sales (all columns)
head(sales, n = 6)
##   X             event_id       primary_act_id     secondary_act_id
## 1 1 abcaf1adb99a935fc661 43f0436b905bfa7c2eec b85143bf51323b72e53c
## 2 2 6c56d7f08c95f2aa453c 1a3e9aecd0617706a794 f53529c5679ea6ca5a48
## 3 3 c7ab4524a121f9d687d2 4b677c3f5bec71eec8d1 b85143bf51323b72e53c
## 4 4 394cb493f893be9b9ed1 b1ccea01ad6ef8522796 b85143bf51323b72e53c
## 5 5 55b5f67e618557929f48 91c03a34b562436efa3c b85143bf51323b72e53c
## 6 6 4f10fd8b9f550352bd56 ac4b847b3fde66f2117e 63814f3d63317f1b56c4
##    purch_party_lkup_id
## 1 7dfa56dd7d5956b17587
## 2 4f9e6fc637eaf7b736c2
## 3 6c2545703bd527a7144d
## 4 527d6b1eaffc69ddd882
## 5 8bd62c394a35213bdf52
## 6 3b3a628f83135acd0676
##                                                       event_name
## 1 Xfinity Center Mansfield Premier Parking: Florida Georgia Line
## 2                  Gorge Camping - dave matthews band - sept 3-7
## 3                    Dodge Theatre Adams Street Parking - benise
## 4   Gexa Energy Pavilion Vip Parking : kid rock with sheryl crow
## 5                                  Premier Parking - motley crue
## 6                                      Fast Lane Access: Journey
##                           primary_act_name secondary_act_name
## 1 XFINITY Center Mansfield Premier Parking               NULL
## 2                            Gorge Camping Dave Matthews Band
## 3                            Parking Event               NULL
## 4         Gexa Energy Pavilion VIP Parking               NULL
## 5 White River Amphitheatre Premier Parking               NULL
## 6                         Fast Lane Access            Journey
##   major_cat_name         minor_cat_name la_event_type_cat
## 1           MISC                PARKING           PARKING
## 2           MISC                CAMPING           INVALID
## 3           MISC                PARKING           PARKING
## 4           MISC                PARKING           PARKING
## 5           MISC                PARKING           PARKING
## 6           MISC SPECIAL ENTRY (UPSELL)            UPSELL
##                                                  event_disp_name
## 1 Xfinity Center Mansfield Premier Parking: Florida Georgia Line
## 2                  Gorge Camping - dave matthews band - sept 3-7
## 3                    Dodge Theatre Adams Street Parking - benise
## 4   Gexa Energy Pavilion Vip Parking : kid rock with sheryl crow
## 5                                  Premier Parking - motley crue
## 6                                      Fast Lane Access: Journey
##                                                                                                                                                    ticket_text
## 1    THIS TICKET IS VALID        FOR PARKING ONLY         GOOD THIS DAY ONLY       PREMIER PARKING PASS    XFINITY CENTER,LOTS 4 PM  SAT SEP 12 2015 7:30 PM  
## 2                                                                %OVERNIGHT C A M P I N G%* * * * * *%GORGE CAMPGROUND%* GOOD THIS DATE ONLY *%SEP 3 - 6, 2009
## 3                               ADAMS STREET GARAGE%PARKING FOR 4/21/06 ONLY%DODGE THEATRE PARKING PASS%ENTRANCE ON ADAMS STREET%BENISE%GARAGE OPENS AT 6:00PM
## 4    THIS TICKET IS VALID        FOR PARKING ONLY      GOOD FOR THIS DATE ONLY       VIP PARKING PASS        GEXA ENERGY PAVILION    FRI SEP 02 2011 7:00 PM  
## 5                              THIS TICKET IS VALID%FOR PARKING ONLY%GOOD THIS DATE ONLY%PREMIER PARKING PASS%WHITE RIVER AMPHITHEATRE%SAT JUL 30, 2005 6:00PM
## 6         FAST LANE                  JOURNEY               FAST LANE EVENT         THIS IS NOT A TICKET    SAN MANUEL AMPHITHEATER   SAT JUL 21 2012 7:00 PM  
##   tickets_purchased_qty trans_face_val_amt delivery_type_cd
## 1                     1                 45          eTicket
## 2                     1                 75       TicketFast
## 3                     1                  5       TicketFast
## 4                     1                 20             Mail
## 5                     1                 20             Mail
## 6                     2                 10       TicketFast
##       event_date_time   event_dt presale_dt  onsale_dt
## 1 2015-09-12 23:30:00 2015-09-12       NULL 2015-05-15
## 2 2009-09-05 01:00:00 2009-09-04       NULL 2009-03-13
## 3 2006-04-22 01:30:00 2006-04-21       NULL 2006-02-25
## 4 2011-09-03 00:00:00 2011-09-02       NULL 2011-04-22
## 5 2005-07-31 01:00:00 2005-07-30 2005-03-02 2005-03-04
## 6 2012-07-22 02:00:00 2012-07-21       NULL 2012-04-11
##   sales_ord_create_dttm sales_ord_tran_dt   print_dt timezn_nm
## 1   2015-09-11 18:17:45        2015-09-11 2015-09-12       EST
## 2   2009-07-06 00:00:00        2009-07-05 2009-09-01       PST
## 3   2006-04-05 00:00:00        2006-04-05 2006-04-05       MST
## 4   2011-07-01 17:38:50        2011-07-01 2011-07-06       CST
## 5   2005-06-18 00:00:00        2005-06-18 2005-06-28       PST
## 6   2012-07-21 17:20:18        2012-07-21 2012-07-21       PST
##       venue_city   venue_state venue_postal_cd_sgmt_1
## 1      MANSFIELD MASSACHUSETTS                  02048
## 2         QUINCY    WASHINGTON                  98848
## 3        PHOENIX       ARIZONA                  85003
## 4         DALLAS         TEXAS                  75210
## 5         AUBURN    WASHINGTON                  98092
## 6 SAN BERNARDINO    CALIFORNIA                  92407
##             sales_platform_cd print_flg la_valid_tkt_event_flg  fin_mkt_nm
## 1 www.concerts.livenation.com        T                      N       Boston
## 2                        NULL        T                      N      Seattle
## 3                        NULL        T                      N      Arizona
## 4                        NULL        T                      N       Dallas
## 5                        NULL        T                      N      Seattle
## 6          www.livenation.com        T                      N  Los Angeles
##   web_session_cookie_val gndr_cd age_yr income_amt edu_val
## 1   7dfa56dd7d5956b17587    <NA>   <NA>       <NA>    <NA>
## 2   4f9e6fc637eaf7b736c2    <NA>   <NA>       <NA>    <NA>
## 3   6c2545703bd527a7144d    <NA>   <NA>       <NA>    <NA>
## 4   527d6b1eaffc69ddd882    <NA>   <NA>       <NA>    <NA>
## 5   8bd62c394a35213bdf52    <NA>   <NA>       <NA>    <NA>
## 6   3b3a628f83135acd0676    <NA>   <NA>       <NA>    <NA>
##   edu_1st_indv_val edu_2nd_indv_val adults_in_hh_num married_ind
## 1             <NA>             <NA>             <NA>        <NA>
## 2             <NA>             <NA>             <NA>        <NA>
## 3             <NA>             <NA>             <NA>        <NA>
## 4             <NA>             <NA>             <NA>        <NA>
## 5             <NA>             <NA>             <NA>        <NA>
## 6             <NA>             <NA>             <NA>        <NA>
##   child_present_ind home_owner_ind occpn_val occpn_1st_val occpn_2nd_val
## 1              <NA>           <NA>      <NA>          <NA>          <NA>
## 2              <NA>           <NA>      <NA>          <NA>          <NA>
## 3              <NA>           <NA>      <NA>          <NA>          <NA>
## 4              <NA>           <NA>      <NA>          <NA>          <NA>
## 5              <NA>           <NA>      <NA>          <NA>          <NA>
## 6              <NA>           <NA>      <NA>          <NA>          <NA>
##   dist_to_ven
## 1          NA
## 2          59
## 3          NA
## 4          NA
## 5          NA
## 6          NA

names()

names()

# View the column names of sales as a character vector
names(sales)
##  [1] "X"                      "event_id"              
##  [3] "primary_act_id"         "secondary_act_id"      
##  [5] "purch_party_lkup_id"    "event_name"            
##  [7] "primary_act_name"       "secondary_act_name"    
##  [9] "major_cat_name"         "minor_cat_name"        
## [11] "la_event_type_cat"      "event_disp_name"       
## [13] "ticket_text"            "tickets_purchased_qty" 
## [15] "trans_face_val_amt"     "delivery_type_cd"      
## [17] "event_date_time"        "event_dt"              
## [19] "presale_dt"             "onsale_dt"             
## [21] "sales_ord_create_dttm"  "sales_ord_tran_dt"     
## [23] "print_dt"               "timezn_nm"             
## [25] "venue_city"             "venue_state"           
## [27] "venue_postal_cd_sgmt_1" "sales_platform_cd"     
## [29] "print_flg"              "la_valid_tkt_event_flg"
## [31] "fin_mkt_nm"             "web_session_cookie_val"
## [33] "gndr_cd"                "age_yr"                
## [35] "income_amt"             "edu_val"               
## [37] "edu_1st_indv_val"       "edu_2nd_indv_val"      
## [39] "adults_in_hh_num"       "married_ind"           
## [41] "child_present_ind"      "home_owner_ind"        
## [43] "occpn_val"              "occpn_1st_val"         
## [45] "occpn_2nd_val"          "dist_to_ven"

NB

  • rows appear to represent individual purchases
  • columns contain different pieces of information about each purchase.
  • each row represents an observation
  • each column a variable, or piece of information about that observation.

Ex3: Summarizing the data

str()

# Look at structure of sales: one line per column showing its type
# and its first few values
str(sales)
## 'data.frame':    5000 obs. of  46 variables:
##  $ X                     : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ event_id              : chr  "abcaf1adb99a935fc661" "6c56d7f08c95f2aa453c" "c7ab4524a121f9d687d2" "394cb493f893be9b9ed1" ...
##  $ primary_act_id        : chr  "43f0436b905bfa7c2eec" "1a3e9aecd0617706a794" "4b677c3f5bec71eec8d1" "b1ccea01ad6ef8522796" ...
##  $ secondary_act_id      : chr  "b85143bf51323b72e53c" "f53529c5679ea6ca5a48" "b85143bf51323b72e53c" "b85143bf51323b72e53c" ...
##  $ purch_party_lkup_id   : chr  "7dfa56dd7d5956b17587" "4f9e6fc637eaf7b736c2" "6c2545703bd527a7144d" "527d6b1eaffc69ddd882" ...
##  $ event_name            : chr  "Xfinity Center Mansfield Premier Parking: Florida Georgia Line" "Gorge Camping - dave matthews band - sept 3-7" "Dodge Theatre Adams Street Parking - benise" "Gexa Energy Pavilion Vip Parking : kid rock with sheryl crow" ...
##  $ primary_act_name      : chr  "XFINITY Center Mansfield Premier Parking" "Gorge Camping" "Parking Event" "Gexa Energy Pavilion VIP Parking" ...
##  $ secondary_act_name    : chr  "NULL" "Dave Matthews Band" "NULL" "NULL" ...
##  $ major_cat_name        : chr  "MISC" "MISC" "MISC" "MISC" ...
##  $ minor_cat_name        : chr  "PARKING" "CAMPING" "PARKING" "PARKING" ...
##  $ la_event_type_cat     : chr  "PARKING" "INVALID" "PARKING" "PARKING" ...
##  $ event_disp_name       : chr  "Xfinity Center Mansfield Premier Parking: Florida Georgia Line" "Gorge Camping - dave matthews band - sept 3-7" "Dodge Theatre Adams Street Parking - benise" "Gexa Energy Pavilion Vip Parking : kid rock with sheryl crow" ...
##  $ ticket_text           : chr  "   THIS TICKET IS VALID        FOR PARKING ONLY         GOOD THIS DAY ONLY       PREMIER PARKING PASS    XFINIT"| __truncated__ "%OVERNIGHT C A M P I N G%* * * * * *%GORGE CAMPGROUND%* GOOD THIS DATE ONLY *%SEP 3 - 6, 2009" "ADAMS STREET GARAGE%PARKING FOR 4/21/06 ONLY%DODGE THEATRE PARKING PASS%ENTRANCE ON ADAMS STREET%BENISE%GARAGE OPENS AT 6:00PM" "   THIS TICKET IS VALID        FOR PARKING ONLY      GOOD FOR THIS DATE ONLY       VIP PARKING PASS        GEXA"| __truncated__ ...
##  $ tickets_purchased_qty : int  1 1 1 1 1 2 1 1 1 1 ...
##  $ trans_face_val_amt    : num  45 75 5 20 20 10 30 28 20 25 ...
##  $ delivery_type_cd      : chr  "eTicket" "TicketFast" "TicketFast" "Mail" ...
##  $ event_date_time       : chr  "2015-09-12 23:30:00" "2009-09-05 01:00:00" "2006-04-22 01:30:00" "2011-09-03 00:00:00" ...
##  $ event_dt              : chr  "2015-09-12" "2009-09-04" "2006-04-21" "2011-09-02" ...
##  $ presale_dt            : chr  "NULL" "NULL" "NULL" "NULL" ...
##  $ onsale_dt             : chr  "2015-05-15" "2009-03-13" "2006-02-25" "2011-04-22" ...
##  $ sales_ord_create_dttm : chr  "2015-09-11 18:17:45" "2009-07-06 00:00:00" "2006-04-05 00:00:00" "2011-07-01 17:38:50" ...
##  $ sales_ord_tran_dt     : chr  "2015-09-11" "2009-07-05" "2006-04-05" "2011-07-01" ...
##  $ print_dt              : chr  "2015-09-12" "2009-09-01" "2006-04-05" "2011-07-06" ...
##  $ timezn_nm             : chr  "EST" "PST" "MST" "CST" ...
##  $ venue_city            : chr  "MANSFIELD" "QUINCY" "PHOENIX" "DALLAS" ...
##  $ venue_state           : chr  "MASSACHUSETTS" "WASHINGTON" "ARIZONA" "TEXAS" ...
##  $ venue_postal_cd_sgmt_1: chr  "02048" "98848" "85003" "75210" ...
##  $ sales_platform_cd     : chr  "www.concerts.livenation.com" "NULL" "NULL" "NULL" ...
##  $ print_flg             : chr  "T " "T " "T " "T " ...
##  $ la_valid_tkt_event_flg: chr  "N " "N " "N " "N " ...
##  $ fin_mkt_nm            : chr  "Boston" "Seattle" "Arizona" "Dallas" ...
##  $ web_session_cookie_val: chr  "7dfa56dd7d5956b17587" "4f9e6fc637eaf7b736c2" "6c2545703bd527a7144d" "527d6b1eaffc69ddd882" ...
##  $ gndr_cd               : chr  NA NA NA NA ...
##  $ age_yr                : chr  NA NA NA NA ...
##  $ income_amt            : chr  NA NA NA NA ...
##  $ edu_val               : chr  NA NA NA NA ...
##  $ edu_1st_indv_val      : chr  NA NA NA NA ...
##  $ edu_2nd_indv_val      : chr  NA NA NA NA ...
##  $ adults_in_hh_num      : chr  NA NA NA NA ...
##  $ married_ind           : chr  NA NA NA NA ...
##  $ child_present_ind     : chr  NA NA NA NA ...
##  $ home_owner_ind        : chr  NA NA NA NA ...
##  $ occpn_val             : chr  NA NA NA NA ...
##  $ occpn_1st_val         : chr  NA NA NA NA ...
##  $ occpn_2nd_val         : chr  NA NA NA NA ...
##  $ dist_to_ven           : int  NA 59 NA NA NA NA NA NA NA NA ...

summary()

summary()

# View a summary of sales: min/quartiles/mean/max for numeric columns,
# length/class/mode for character columns
summary(sales)
##        X          event_id         primary_act_id     secondary_act_id  
##  Min.   :   1   Length:5000        Length:5000        Length:5000       
##  1st Qu.:1251   Class :character   Class :character   Class :character  
##  Median :2500   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :2500                                                           
##  3rd Qu.:3750                                                           
##  Max.   :5000                                                           
##                                                                         
##  purch_party_lkup_id  event_name        primary_act_name  
##  Length:5000         Length:5000        Length:5000       
##  Class :character    Class :character   Class :character  
##  Mode  :character    Mode  :character   Mode  :character  
##                                                           
##                                                           
##                                                           
##                                                           
##  secondary_act_name major_cat_name     minor_cat_name    
##  Length:5000        Length:5000        Length:5000       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  la_event_type_cat  event_disp_name    ticket_text       
##  Length:5000        Length:5000        Length:5000       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  tickets_purchased_qty trans_face_val_amt delivery_type_cd  
##  Min.   :1.000         Min.   :   1.00    Length:5000       
##  1st Qu.:1.000         1st Qu.:  20.00    Class :character  
##  Median :1.000         Median :  30.00    Mode  :character  
##  Mean   :1.639         Mean   :  77.08                      
##  3rd Qu.:2.000         3rd Qu.:  85.00                      
##  Max.   :8.000         Max.   :1520.88                      
##                                                             
##  event_date_time      event_dt          presale_dt       
##  Length:5000        Length:5000        Length:5000       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##   onsale_dt         sales_ord_create_dttm sales_ord_tran_dt 
##  Length:5000        Length:5000           Length:5000       
##  Class :character   Class :character      Class :character  
##  Mode  :character   Mode  :character      Mode  :character  
##                                                             
##                                                             
##                                                             
##                                                             
##    print_dt          timezn_nm          venue_city       
##  Length:5000        Length:5000        Length:5000       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  venue_state        venue_postal_cd_sgmt_1 sales_platform_cd 
##  Length:5000        Length:5000            Length:5000       
##  Class :character   Class :character       Class :character  
##  Mode  :character   Mode  :character       Mode  :character  
##                                                              
##                                                              
##                                                              
##                                                              
##   print_flg         la_valid_tkt_event_flg  fin_mkt_nm       
##  Length:5000        Length:5000            Length:5000       
##  Class :character   Class :character       Class :character  
##  Mode  :character   Mode  :character       Mode  :character  
##                                                              
##                                                              
##                                                              
##                                                              
##  web_session_cookie_val   gndr_cd             age_yr         
##  Length:5000            Length:5000        Length:5000       
##  Class :character       Class :character   Class :character  
##  Mode  :character       Mode  :character   Mode  :character  
##                                                              
##                                                              
##                                                              
##                                                              
##   income_amt          edu_val          edu_1st_indv_val  
##  Length:5000        Length:5000        Length:5000       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  edu_2nd_indv_val   adults_in_hh_num   married_ind       
##  Length:5000        Length:5000        Length:5000       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  child_present_ind  home_owner_ind      occpn_val        
##  Length:5000        Length:5000        Length:5000       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  occpn_1st_val      occpn_2nd_val       dist_to_ven    
##  Length:5000        Length:5000        Min.   :   0.0  
##  Class :character   Class :character   1st Qu.:  12.0  
##  Mode  :character   Mode  :character   Median :  26.0  
##                                        Mean   : 158.2  
##                                        3rd Qu.:  77.5  
##                                        Max.   :2548.0  
##                                        NA's   :4677

glimpse()

# Load dplyr
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Get a glimpse of sales (glimpse() comes from dplyr, loaded above):
# one line per column with its type and leading values
glimpse(sales)
## Observations: 5,000
## Variables: 46
## $ X                      <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, ...
## $ event_id               <chr> "abcaf1adb99a935fc661", "6c56d7f08c95f2...
## $ primary_act_id         <chr> "43f0436b905bfa7c2eec", "1a3e9aecd06177...
## $ secondary_act_id       <chr> "b85143bf51323b72e53c", "f53529c5679ea6...
## $ purch_party_lkup_id    <chr> "7dfa56dd7d5956b17587", "4f9e6fc637eaf7...
## $ event_name             <chr> "Xfinity Center Mansfield Premier Parki...
## $ primary_act_name       <chr> "XFINITY Center Mansfield Premier Parki...
## $ secondary_act_name     <chr> "NULL", "Dave Matthews Band", "NULL", "...
## $ major_cat_name         <chr> "MISC", "MISC", "MISC", "MISC", "MISC",...
## $ minor_cat_name         <chr> "PARKING", "CAMPING", "PARKING", "PARKI...
## $ la_event_type_cat      <chr> "PARKING", "INVALID", "PARKING", "PARKI...
## $ event_disp_name        <chr> "Xfinity Center Mansfield Premier Parki...
## $ ticket_text            <chr> "   THIS TICKET IS VALID        FOR PAR...
## $ tickets_purchased_qty  <int> 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 4, ...
## $ trans_face_val_amt     <dbl> 45, 75, 5, 20, 20, 10, 30, 28, 20, 25, ...
## $ delivery_type_cd       <chr> "eTicket", "TicketFast", "TicketFast", ...
## $ event_date_time        <chr> "2015-09-12 23:30:00", "2009-09-05 01:0...
## $ event_dt               <chr> "2015-09-12", "2009-09-04", "2006-04-21...
## $ presale_dt             <chr> "NULL", "NULL", "NULL", "NULL", "2005-0...
## $ onsale_dt              <chr> "2015-05-15", "2009-03-13", "2006-02-25...
## $ sales_ord_create_dttm  <chr> "2015-09-11 18:17:45", "2009-07-06 00:0...
## $ sales_ord_tran_dt      <chr> "2015-09-11", "2009-07-05", "2006-04-05...
## $ print_dt               <chr> "2015-09-12", "2009-09-01", "2006-04-05...
## $ timezn_nm              <chr> "EST", "PST", "MST", "CST", "PST", "PST...
## $ venue_city             <chr> "MANSFIELD", "QUINCY", "PHOENIX", "DALL...
## $ venue_state            <chr> "MASSACHUSETTS", "WASHINGTON", "ARIZONA...
## $ venue_postal_cd_sgmt_1 <chr> "02048", "98848", "85003", "75210", "98...
## $ sales_platform_cd      <chr> "www.concerts.livenation.com", "NULL", ...
## $ print_flg              <chr> "T ", "T ", "T ", "T ", "T ", "T ", "T ...
## $ la_valid_tkt_event_flg <chr> "N ", "N ", "N ", "N ", "N ", "N ", "N ...
## $ fin_mkt_nm             <chr> "Boston", "Seattle", "Arizona", "Dallas...
## $ web_session_cookie_val <chr> "7dfa56dd7d5956b17587", "4f9e6fc637eaf7...
## $ gndr_cd                <chr> NA, NA, NA, NA, NA, NA, "M", NA, NA, NA...
## $ age_yr                 <chr> NA, NA, NA, NA, NA, NA, "28", NA, NA, N...
## $ income_amt             <chr> NA, NA, NA, NA, NA, NA, "112500", NA, N...
## $ edu_val                <chr> NA, NA, NA, NA, NA, NA, "High School", ...
## $ edu_1st_indv_val       <chr> NA, NA, NA, NA, NA, NA, "High School", ...
## $ edu_2nd_indv_val       <chr> NA, NA, NA, NA, NA, NA, "NULL", NA, NA,...
## $ adults_in_hh_num       <chr> NA, NA, NA, NA, NA, NA, "4", NA, NA, NA...
## $ married_ind            <chr> NA, NA, NA, NA, NA, NA, "0", NA, NA, NA...
## $ child_present_ind      <chr> NA, NA, NA, NA, NA, NA, "1", NA, NA, NA...
## $ home_owner_ind         <chr> NA, NA, NA, NA, NA, NA, "0", NA, NA, NA...
## $ occpn_val              <chr> NA, NA, NA, NA, NA, NA, "NULL", NA, NA,...
## $ occpn_1st_val          <chr> NA, NA, NA, NA, NA, NA, "Craftsman Blue...
## $ occpn_2nd_val          <chr> NA, NA, NA, NA, NA, NA, "NULL", NA, NA,...
## $ dist_to_ven            <int> NA, 59, NA, NA, NA, NA, NA, NA, NA, NA,...

Ex4: Removing redundant info

The first column, X, is just a running count: it duplicates the row numbers.

nrow() ncol()

# Number of rows in sales
nrow(sales)
## [1] 5000
# Number of columns in sales
ncol(sales)
## [1] 46

Remove the first column of sales

method 1: subset

# (notebook chunk option: results='hide', so no output is shown here)
# Subsetting a data frame with [rows, columns]
sales[, 4]                        # the fourth column on its own
sales2 <- sales[, 2:ncol(sales)]  # columns 2 to last, i.e. drop column 1 (X)

method 2: negative indices

# ```{r ... results='hide'}
# remove rows and columns using negative indices
sales[-(1:5), ] # Omit first 5 rows of sales
sales[, -4]     # Omit fourth column of sales
sales2 <- sales[,-1] # Omit first column (X) of sales

Verify

# Verify that column X was removed from sales2.
# For comparison: the original sales still starts with X ...
str(sales[,1:5])
## 'data.frame':    5000 obs. of  5 variables:
##  $ X                  : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ event_id           : chr  "abcaf1adb99a935fc661" "6c56d7f08c95f2aa453c" "c7ab4524a121f9d687d2" "394cb493f893be9b9ed1" ...
##  $ primary_act_id     : chr  "43f0436b905bfa7c2eec" "1a3e9aecd0617706a794" "4b677c3f5bec71eec8d1" "b1ccea01ad6ef8522796" ...
##  $ secondary_act_id   : chr  "b85143bf51323b72e53c" "f53529c5679ea6ca5a48" "b85143bf51323b72e53c" "b85143bf51323b72e53c" ...
##  $ purch_party_lkup_id: chr  "7dfa56dd7d5956b17587" "4f9e6fc637eaf7b736c2" "6c2545703bd527a7144d" "527d6b1eaffc69ddd882" ...
# ... whereas the first five columns of sales2 no longer include X
str(sales2[,1:5])
## 'data.frame':    5000 obs. of  5 variables:
##  $ event_id           : chr  "abcaf1adb99a935fc661" "6c56d7f08c95f2aa453c" "c7ab4524a121f9d687d2" "394cb493f893be9b9ed1" ...
##  $ primary_act_id     : chr  "43f0436b905bfa7c2eec" "1a3e9aecd0617706a794" "4b677c3f5bec71eec8d1" "b1ccea01ad6ef8522796" ...
##  $ secondary_act_id   : chr  "b85143bf51323b72e53c" "f53529c5679ea6ca5a48" "b85143bf51323b72e53c" "b85143bf51323b72e53c" ...
##  $ purch_party_lkup_id: chr  "7dfa56dd7d5956b17587" "4f9e6fc637eaf7b736c2" "6c2545703bd527a7144d" "527d6b1eaffc69ddd882" ...
##  $ event_name         : chr  "Xfinity Center Mansfield Premier Parking: Florida Georgia Line" "Gorge Camping - dave matthews band - sept 3-7" "Dodge Theatre Adams Street Parking - benise" "Gexa Energy Pavilion Vip Parking : kid rock with sheryl crow" ...

Ex5: Information not worth keeping

  • first four columns contain internal codes representing particular events.
  • last fifteen columns - too many missing values.
  1. create a vector containing the column indices you want to keep
# Column indices to retain: keep
# Everything except the first 4 and the last 15 columns of sales2
keep <- seq(from = 5, to = ncol(sales2) - 15)
  2. subset the data based on that vector using single bracket subsetting.
# Subset sales2 using keep: sales3
# (single-bracket subsetting; keep selects columns, all rows are retained)
sales3 <- sales2[,keep]

Ex6: Separating columns

Separate a column into two: one for the date and one for the time.

event_date_time

# Load tidyr
library(tidyr)

# Look at the first values of the event_date_time column
# to see which character separates the date from the time
head(sales3$event_date_time)
## [1] "2015-09-12 23:30:00" "2009-09-05 01:00:00" "2006-04-22 01:30:00"
## [4] "2011-09-03 00:00:00" "2005-07-31 01:00:00" "2012-07-22 02:00:00"
#date and time are separated by a space
# => sep = " "

# Split event_date_time into a date column and a time column: sales4
# NOTE(review): sales3 appears to already contain an event_dt column
# (see names(sales)) -- confirm how separate() treats the repeated name.
sales4 <- separate(
  data = sales3,
  col = event_date_time,               # column to split (no quotes)
  into = c("event_dt", "event_time"),  # new names: c() and quotes required
  sep = " "                            # quotes required
)

## separate() should have four arguments:
## data frame
## column name without quotes
## character vector with names of new columns (in quotes)
## character string showing where to separate the original column (in quotes)

sales_ord_create_dttm


# Which separator is needed? Inspect the first values of sales_ord_create_dttm
head(sales4$sales_ord_create_dttm)
## [1] "2015-09-11 18:17:45" "2009-07-06 00:00:00" "2006-04-05 00:00:00"
## [4] "2011-07-01 17:38:50" "2005-06-18 00:00:00" "2012-07-21 17:20:18"

# Split sales_ord_create_dttm into order date and order time: sales5
sales5 <- separate(
  data = sales4,
  col = sales_ord_create_dttm,
  into = c("ord_create_dt", "ord_create_time"),
  sep = " "
)
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 4 rows
## [2516, 3863, 4082, 4183].

Ex7: warnings?

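The warning “Expected 2 pieces. Missing pieces filled with `NA` in 4 rows” (worded “Too few values at 4 locations” in older tidyr versions) lists the affected locations (i.e. rows): 2516, 3863, 4082, and 4183.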

# Row numbers reported in the separate() warning
issues <- c(2516, 3863, 4082, 4183)

# Show the unsplit sales_ord_create_dttm values (from sales3) in those rows
sales3[["sales_ord_create_dttm"]][issues]
## [1] "NULL" "NULL" "NULL" "NULL"
# phew.. warning was just because of four missing values

let’s see a well-behaved value

# For contrast, show a row that split cleanly (row 2517)
sales3[["sales_ord_create_dttm"]][2517]
## [1] "2013-08-04 23:07:19"

Ex8: Identifying dates

all of the date columns in this dataset have the substring “dt” in their name

library(stringr)

# Logical vector marking the columns of sales5 whose name contains "dt"
date_cols <- str_detect(string = names(sales5), pattern = "dt")


library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
# lubridate parser names combine the letters y, m, d, h, m, and s

# Convert every date column to a Date object with ymd()
# general pattern: lapply(my_data_frame[, cols], function_name)
sales5[, date_cols] <- lapply(X = sales5[, date_cols], FUN = ymd)
## Warning: 2892 failed to parse.
## Warning: 101 failed to parse.
## Warning: 4 failed to parse.
## Warning: 424 failed to parse.

Ex9: warning: failure to parse

Coercing the date columns produced four warnings: “2892 failed to parse.”, “101 failed to parse.”, “4 failed to parse.” and “424 failed to parse.”


# Find date columns (don't change)
# Logical vector: TRUE for each column of sales5 whose name contains "dt"
date_cols <- str_detect(names(sales5), "dt")

# Create logical vectors indicating missing values (don't change)
# Result is a list with one logical vector per date column.
# NOTE(review): the name `missing` shadows base::missing() in this session.
missing <- lapply(sales5[, date_cols], is.na)

## Each vector indicates the presence (or absence) 
## of missing values in the corresponding column of sales5

## CHECK if:
## number of missing values in each column 
## = 
## number of rows that failed to parse

# Create a numerical vector that counts missing values: num_missing
# sum() over each logical vector counts its TRUEs. vapply() with integer(1)
# guarantees a named integer vector; sapply() would return list() if
# `missing` were empty.
num_missing <- vapply(missing, sum, integer(1))
num_missing
##          event_dt        presale_dt         onsale_dt     ord_create_dt 
##                 0              2892               101                 4 
## sales_ord_tran_dt          print_dt 
##                 0               424

Ex10: Combining columns

Unite the venue_city and venue_state columns into a single column, separated by a comma and a space. For example, “PORTLAND” and “MAINE” should become “PORTLAND, MAINE”.

# unite() should have five arguments:
## data frame
## The name of the new column to create (no quotes)
## The name of the first column to merge (no quotes)
## The name of the second column to merge (no quotes)
## A character string containing the separator (with quotes)


# Merge venue_city and venue_state into one venue_city_state column
sales6 <- unite(
  data = sales5,
  col = venue_city_state,
  venue_city, venue_state,
  sep = ", " # the separator has to be passed by name ("sep")
)

# View the head of sales6
head(sales6)
##                                                       event_name
## 1 Xfinity Center Mansfield Premier Parking: Florida Georgia Line
## 2                  Gorge Camping - dave matthews band - sept 3-7
## 3                    Dodge Theatre Adams Street Parking - benise
## 4   Gexa Energy Pavilion Vip Parking : kid rock with sheryl crow
## 5                                  Premier Parking - motley crue
## 6                                      Fast Lane Access: Journey
##                           primary_act_name secondary_act_name
## 1 XFINITY Center Mansfield Premier Parking               NULL
## 2                            Gorge Camping Dave Matthews Band
## 3                            Parking Event               NULL
## 4         Gexa Energy Pavilion VIP Parking               NULL
## 5 White River Amphitheatre Premier Parking               NULL
## 6                         Fast Lane Access            Journey
##   major_cat_name         minor_cat_name la_event_type_cat
## 1           MISC                PARKING           PARKING
## 2           MISC                CAMPING           INVALID
## 3           MISC                PARKING           PARKING
## 4           MISC                PARKING           PARKING
## 5           MISC                PARKING           PARKING
## 6           MISC SPECIAL ENTRY (UPSELL)            UPSELL
##                                                  event_disp_name
## 1 Xfinity Center Mansfield Premier Parking: Florida Georgia Line
## 2                  Gorge Camping - dave matthews band - sept 3-7
## 3                    Dodge Theatre Adams Street Parking - benise
## 4   Gexa Energy Pavilion Vip Parking : kid rock with sheryl crow
## 5                                  Premier Parking - motley crue
## 6                                      Fast Lane Access: Journey
##                                                                                                                                                    ticket_text
## 1    THIS TICKET IS VALID        FOR PARKING ONLY         GOOD THIS DAY ONLY       PREMIER PARKING PASS    XFINITY CENTER,LOTS 4 PM  SAT SEP 12 2015 7:30 PM  
## 2                                                                %OVERNIGHT C A M P I N G%* * * * * *%GORGE CAMPGROUND%* GOOD THIS DATE ONLY *%SEP 3 - 6, 2009
## 3                               ADAMS STREET GARAGE%PARKING FOR 4/21/06 ONLY%DODGE THEATRE PARKING PASS%ENTRANCE ON ADAMS STREET%BENISE%GARAGE OPENS AT 6:00PM
## 4    THIS TICKET IS VALID        FOR PARKING ONLY      GOOD FOR THIS DATE ONLY       VIP PARKING PASS        GEXA ENERGY PAVILION    FRI SEP 02 2011 7:00 PM  
## 5                              THIS TICKET IS VALID%FOR PARKING ONLY%GOOD THIS DATE ONLY%PREMIER PARKING PASS%WHITE RIVER AMPHITHEATRE%SAT JUL 30, 2005 6:00PM
## 6         FAST LANE                  JOURNEY               FAST LANE EVENT         THIS IS NOT A TICKET    SAN MANUEL AMPHITHEATER   SAT JUL 21 2012 7:00 PM  
##   tickets_purchased_qty trans_face_val_amt delivery_type_cd   event_dt
## 1                     1                 45          eTicket 2015-09-12
## 2                     1                 75       TicketFast 2009-09-05
## 3                     1                  5       TicketFast 2006-04-22
## 4                     1                 20             Mail 2011-09-03
## 5                     1                 20             Mail 2005-07-31
## 6                     2                 10       TicketFast 2012-07-22
##   event_time presale_dt  onsale_dt ord_create_dt ord_create_time
## 1   23:30:00       <NA> 2015-05-15    2015-09-11        18:17:45
## 2   01:00:00       <NA> 2009-03-13    2009-07-06        00:00:00
## 3   01:30:00       <NA> 2006-02-25    2006-04-05        00:00:00
## 4   00:00:00       <NA> 2011-04-22    2011-07-01        17:38:50
## 5   01:00:00 2005-03-02 2005-03-04    2005-06-18        00:00:00
## 6   02:00:00       <NA> 2012-04-11    2012-07-21        17:20:18
##   sales_ord_tran_dt   print_dt timezn_nm           venue_city_state
## 1        2015-09-11 2015-09-12       EST   MANSFIELD, MASSACHUSETTS
## 2        2009-07-05 2009-09-01       PST         QUINCY, WASHINGTON
## 3        2006-04-05 2006-04-05       MST           PHOENIX, ARIZONA
## 4        2011-07-01 2011-07-06       CST              DALLAS, TEXAS
## 5        2005-06-18 2005-06-28       PST         AUBURN, WASHINGTON
## 6        2012-07-21 2012-07-21       PST SAN BERNARDINO, CALIFORNIA
##   venue_postal_cd_sgmt_1           sales_platform_cd print_flg
## 1                  02048 www.concerts.livenation.com        T 
## 2                  98848                        NULL        T 
## 3                  85003                        NULL        T 
## 4                  75210                        NULL        T 
## 5                  98092                        NULL        T 
## 6                  92407          www.livenation.com        T 
##   la_valid_tkt_event_flg  fin_mkt_nm
## 1                     N       Boston
## 2                     N      Seattle
## 3                     N      Arizona
## 4                     N       Dallas
## 5                     N      Seattle
## 6                     N  Los Angeles
head(sales6$venue_city_state)
## [1] "MANSFIELD, MASSACHUSETTS"   "QUINCY, WASHINGTON"        
## [3] "PHOENIX, ARIZONA"           "DALLAS, TEXAS"             
## [5] "AUBURN, WASHINGTON"         "SAN BERNARDINO, CALIFORNIA"

MBTA Ridership

MBTA = Massachusetts Bay Transportation Authority = “the T” for short. It manages America’s oldest subway, as well as Greater Boston’s commuter rail, ferry, and bus systems.

TASK: average ridership through time => need to clean the data

Ex1: readxl

Import mbta.xlsx and skip first row


url_mbta <- "http://s3.amazonaws.com/assets.datacamp.com/production/course_1294/datasets/mbta.xlsx"

# An .xlsx file is a binary (zip) archive, so it has to be written in binary
# mode. On Windows the default text mode corrupts such files, which is most
# likely why the plain download.file() call produced a file that did NOT open
# and the workbook had to be saved by hand into a Temp folder.
download.file(url_mbta, "mbta.xlsx", mode = "wb")

# Load Hadley Wickham's readxl package
library(readxl)

# Import mbta.xlsx and skip first row: mbta
# we know: first row is a title
mbta <- read_excel("mbta.xlsx", skip = 1)

# NOT RUN: read_excel() needs a local file, so passing the URL directly
# returns the error message below (it cannot open the URL as a zip file).
#
# mbta <- read_excel(url_mbta, skip = 1)
#
# Error in read_fun(path = path, sheet = sheet, limits = limits, shim = shim,  : 
#  Evaluation error: zip file 'http://s3.amazonaws.com/assets.datacamp.com/production/course_1294/datasets/mbta.xlsx' cannot be opened

Ex2: Examining the data

Do you notice anything strange about how the rows and columns are organized?


str(mbta)
## Classes 'tbl_df', 'tbl' and 'data.frame':    11 obs. of  60 variables:
##  $ X__1   : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ mode   : chr  "All Modes by Qtr" "Boat" "Bus" "Commuter Rail" ...
##  $ 2007-01: chr  "NA" "4" "335.81900000000002" "142.19999999999999" ...
##  $ 2007-02: chr  "NA" "3.6" "338.67500000000001" "138.5" ...
##  $ 2007-03: num  1188 40 340 138 459 ...
##  $ 2007-04: chr  "NA" "4.3" "352.16199999999998" "139.5" ...
##  $ 2007-05: chr  "NA" "4.9000000000000004" "354.36700000000002" "139" ...
##  $ 2007-06: num  1246 5.8 350.5 143 477 ...
##  $ 2007-07: chr  "NA" "6.5209999999999999" "357.51900000000001" "142.39099999999999" ...
##  $ 2007-08: chr  "NA" "6.5720000000000001" "355.47899999999998" "142.364" ...
##  $ 2007-09: num  1256.57 5.47 372.6 143.05 499.57 ...
##  $ 2007-10: chr  "NA" "5.1449999999999996" "368.84699999999998" "146.542" ...
##  $ 2007-11: chr  "NA" "3.7629999999999999" "330.82600000000002" "145.089" ...
##  $ 2007-12: num  1216.89 2.98 312.92 141.59 448.27 ...
##  $ 2008-01: chr  "NA" "3.1749999999999998" "340.32400000000001" "142.14500000000001" ...
##  $ 2008-02: chr  "NA" "3.1110000000000002" "352.90499999999997" "142.607" ...
##  $ 2008-03: num  1253.52 3.51 361.15 137.45 494.05 ...
##  $ 2008-04: chr  "NA" "4.1639999999999997" "368.18900000000002" "140.38900000000001" ...
##  $ 2008-05: chr  "NA" "4.0149999999999997" "363.90300000000002" "142.58500000000001" ...
##  $ 2008-06: num  1314.82 5.19 362.96 142.06 518.35 ...
##  $ 2008-07: chr  "NA" "6.016" "370.92099999999999" "145.73099999999999" ...
##  $ 2008-08: chr  "NA" "5.8" "361.05700000000002" "144.565" ...
##  $ 2008-09: num  1307.04 4.59 389.54 141.91 517.32 ...
##  $ 2008-10: chr  "NA" "4.2850000000000001" "357.97399999999999" "151.95699999999999" ...
##  $ 2008-11: chr  "NA" "3.488" "345.423" "152.952" ...
##  $ 2008-12: num  1232.65 3.01 325.77 140.81 446.74 ...
##  $ 2009-01: chr  "NA" "3.0139999999999998" "338.53199999999998" "141.44800000000001" ...
##  $ 2009-02: chr  "NA" "3.1960000000000002" "360.41199999999998" "143.529" ...
##  $ 2009-03: num  1209.79 3.33 353.69 142.89 467.22 ...
##  $ 2009-04: chr  "NA" "4.0490000000000004" "359.38" "142.34" ...
##  $ 2009-05: chr  "NA" "4.1189999999999998" "354.75" "144.22499999999999" ...
##  $ 2009-06: num  1233.1 4.9 347.9 142 473.1 ...
##  $ 2009-07: chr  "NA" "6.444" "339.47699999999998" "137.691" ...
##  $ 2009-08: chr  "NA" "5.9029999999999996" "332.661" "139.15799999999999" ...
##  $ 2009-09: num  1230.5 4.7 374.3 139.1 500.4 ...
##  $ 2009-10: chr  "NA" "4.2119999999999997" "385.86799999999999" "137.10400000000001" ...
##  $ 2009-11: chr  "NA" "3.5760000000000001" "366.98" "129.34299999999999" ...
##  $ 2009-12: num  1207.85 3.11 332.39 126.07 440.93 ...
##  $ 2010-01: chr  "NA" "3.2069999999999999" "362.226" "130.91" ...
##  $ 2010-02: chr  "NA" "3.1949999999999998" "361.13799999999998" "131.91800000000001" ...
##  $ 2010-03: num  1208.86 3.48 373.44 131.25 483.4 ...
##  $ 2010-04: chr  "NA" "4.452" "378.61099999999999" "131.72200000000001" ...
##  $ 2010-05: chr  "NA" "4.415" "380.17099999999999" "128.80000000000001" ...
##  $ 2010-06: num  1244.41 5.41 363.27 129.14 490.26 ...
##  $ 2010-07: chr  "NA" "6.5129999999999999" "353.04" "122.935" ...
##  $ 2010-08: chr  "NA" "6.2690000000000001" "343.68799999999999" "129.732" ...
##  $ 2010-09: num  1225.5 4.7 381.6 132.9 521.1 ...
##  $ 2010-10: chr  "NA" "4.4020000000000001" "384.98700000000002" "131.03299999999999" ...
##  $ 2010-11: chr  "NA" "3.7309999999999999" "367.95499999999998" "130.88900000000001" ...
##  $ 2010-12: num  1216.26 3.16 326.34 121.42 450.43 ...
##  $ 2011-01: chr  "NA" "3.14" "334.95800000000003" "128.39599999999999" ...
##  $ 2011-02: chr  "NA" "3.2839999999999998" "346.23399999999998" "125.46299999999999" ...
##  $ 2011-03: num  1223.45 3.67 380.4 134.37 516.73 ...
##  $ 2011-04: chr  "NA" "4.2510000000000003" "380.44600000000003" "134.16900000000001" ...
##  $ 2011-05: chr  "NA" "4.431" "385.28899999999999" "136.13999999999999" ...
##  $ 2011-06: num  1302.41 5.47 376.32 135.58 529.53 ...
##  $ 2011-07: chr  "NA" "6.5810000000000004" "361.58499999999998" "132.41" ...
##  $ 2011-08: chr  "NA" "6.7329999999999997" "353.79300000000001" "130.61600000000001" ...
##  $ 2011-09: num  1291 5 388 137 550 ...
##  $ 2011-10: chr  "NA" "4.484" "398.45600000000002" "128.72" ...

head(mbta, 6)
## # A tibble: 6 x 60
##    X__1 mode   `2007-01` `2007-02` `2007-03` `2007-04` `2007-05` `2007-06`
##   <dbl> <chr>  <chr>     <chr>         <dbl> <chr>     <chr>         <dbl>
## 1     1 All M~ NA        NA            1188. NA        NA           1246. 
## 2     2 Boat   4         3.6             40  4.3       4.900000~       5.8
## 3     3 Bus    335.8190~ 338.6750~      340. 352.1619~ 354.3670~     351. 
## 4     4 Commu~ 142.1999~ 138.5          138. 139.5     139           143  
## 5     5 Heavy~ 435.2939~ 448.2710~      459. 472.2010~ 474.5790~     477. 
## 6     6 Light~ 227.2309~ 240.262        241. 255.5569~ 248.262       246. 
## # ... with 52 more variables: `2007-07` <chr>, `2007-08` <chr>,
## #   `2007-09` <dbl>, `2007-10` <chr>, `2007-11` <chr>, `2007-12` <dbl>,
## #   `2008-01` <chr>, `2008-02` <chr>, `2008-03` <dbl>, `2008-04` <chr>,
## #   `2008-05` <chr>, `2008-06` <dbl>, `2008-07` <chr>, `2008-08` <chr>,
## #   `2008-09` <dbl>, `2008-10` <chr>, `2008-11` <chr>, `2008-12` <dbl>,
## #   `2009-01` <chr>, `2009-02` <chr>, `2009-03` <dbl>, `2009-04` <chr>,
## #   `2009-05` <chr>, `2009-06` <dbl>, `2009-07` <chr>, `2009-08` <chr>,
## #   `2009-09` <dbl>, `2009-10` <chr>, `2009-11` <chr>, `2009-12` <dbl>,
## #   `2010-01` <chr>, `2010-02` <chr>, `2010-03` <dbl>, `2010-04` <chr>,
## #   `2010-05` <chr>, `2010-06` <dbl>, `2010-07` <chr>, `2010-08` <chr>,
## #   `2010-09` <dbl>, `2010-10` <chr>, `2010-11` <chr>, `2010-12` <dbl>,
## #   `2011-01` <chr>, `2011-02` <chr>, `2011-03` <dbl>, `2011-04` <chr>,
## #   `2011-05` <chr>, `2011-06` <dbl>, `2011-07` <chr>, `2011-08` <chr>,
## #   `2011-09` <dbl>, `2011-10` <chr>

summary(mbta)
##       X__1          mode             2007-01            2007-02         
##  Min.   : 1.0   Length:11          Length:11          Length:11         
##  1st Qu.: 3.5   Class :character   Class :character   Class :character  
##  Median : 6.0   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 6.0                                                           
##  3rd Qu.: 8.5                                                           
##  Max.   :11.0                                                           
##     2007-03           2007-04            2007-05         
##  Min.   :   0.114   Length:11          Length:11         
##  1st Qu.:   9.278   Class :character   Class :character  
##  Median : 137.700   Mode  :character   Mode  :character  
##  Mean   : 330.293                                        
##  3rd Qu.: 399.225                                        
##  Max.   :1204.725                                        
##     2007-06           2007-07            2007-08         
##  Min.   :   0.096   Length:11          Length:11         
##  1st Qu.:   5.700   Class :character   Class :character  
##  Median : 143.000   Mode  :character   Mode  :character  
##  Mean   : 339.846                                        
##  3rd Qu.: 413.788                                        
##  Max.   :1246.129                                        
##     2007-09           2007-10            2007-11         
##  Min.   :  -0.007   Length:11          Length:11         
##  1st Qu.:   5.539   Class :character   Class :character  
##  Median : 143.051   Mode  :character   Mode  :character  
##  Mean   : 352.554                                        
##  3rd Qu.: 436.082                                        
##  Max.   :1310.764                                        
##     2007-12           2008-01            2008-02         
##  Min.   :  -0.060   Length:11          Length:11         
##  1st Qu.:   4.385   Class :character   Class :character  
##  Median : 141.585   Mode  :character   Mode  :character  
##  Mean   : 321.588                                        
##  3rd Qu.: 380.594                                        
##  Max.   :1216.890                                        
##     2008-03           2008-04            2008-05         
##  Min.   :   0.058   Length:11          Length:11         
##  1st Qu.:   5.170   Class :character   Class :character  
##  Median : 137.453   Mode  :character   Mode  :character  
##  Mean   : 345.604                                        
##  3rd Qu.: 427.601                                        
##  Max.   :1274.031                                        
##     2008-06           2008-07            2008-08         
##  Min.   :   0.060   Length:11          Length:11         
##  1st Qu.:   5.742   Class :character   Class :character  
##  Median : 142.057   Mode  :character   Mode  :character  
##  Mean   : 359.667                                        
##  3rd Qu.: 440.656                                        
##  Max.   :1320.728                                        
##     2008-09           2008-10            2008-11         
##  Min.   :   0.021   Length:11          Length:11         
##  1st Qu.:   5.691   Class :character   Class :character  
##  Median : 141.907   Mode  :character   Mode  :character  
##  Mean   : 362.099                                        
##  3rd Qu.: 453.430                                        
##  Max.   :1338.015                                        
##     2008-12           2009-01            2009-02         
##  Min.   :  -0.015   Length:11          Length:11         
##  1st Qu.:   4.689   Class :character   Class :character  
##  Median : 140.810   Mode  :character   Mode  :character  
##  Mean   : 319.882                                        
##  3rd Qu.: 386.255                                        
##  Max.   :1232.655                                        
##     2009-03           2009-04            2009-05         
##  Min.   :  -0.050   Length:11          Length:11         
##  1st Qu.:   5.003   Class :character   Class :character  
##  Median : 142.893   Mode  :character   Mode  :character  
##  Mean   : 330.142                                        
##  3rd Qu.: 410.455                                        
##  Max.   :1210.912                                        
##     2009-06           2009-07            2009-08         
##  Min.   :  -0.079   Length:11          Length:11         
##  1st Qu.:   5.845   Class :character   Class :character  
##  Median : 142.006   Mode  :character   Mode  :character  
##  Mean   : 333.194                                        
##  3rd Qu.: 410.482                                        
##  Max.   :1233.085                                        
##     2009-09           2009-10            2009-11         
##  Min.   :  -0.035   Length:11          Length:11         
##  1st Qu.:   5.693   Class :character   Class :character  
##  Median : 139.087   Mode  :character   Mode  :character  
##  Mean   : 346.687                                        
##  3rd Qu.: 437.332                                        
##  Max.   :1291.564                                        
##     2009-12           2010-01            2010-02         
##  Min.   :  -0.022   Length:11          Length:11         
##  1st Qu.:   4.784   Class :character   Class :character  
##  Median : 126.066   Mode  :character   Mode  :character  
##  Mean   : 312.962                                        
##  3rd Qu.: 386.659                                        
##  Max.   :1207.845                                        
##     2010-03           2010-04            2010-05         
##  Min.   :   0.012   Length:11          Length:11         
##  1st Qu.:   5.274   Class :character   Class :character  
##  Median : 131.252   Mode  :character   Mode  :character  
##  Mean   : 332.726                                        
##  3rd Qu.: 428.420                                        
##  Max.   :1225.556                                        
##     2010-06           2010-07            2010-08         
##  Min.   :   0.008   Length:11          Length:11         
##  1st Qu.:   6.436   Class :character   Class :character  
##  Median : 129.144   Mode  :character   Mode  :character  
##  Mean   : 335.964                                        
##  3rd Qu.: 426.769                                        
##  Max.   :1244.409                                        
##     2010-09           2010-10            2010-11         
##  Min.   :   0.001   Length:11          Length:11         
##  1st Qu.:   5.567   Class :character   Class :character  
##  Median : 132.892   Mode  :character   Mode  :character  
##  Mean   : 346.524                                        
##  3rd Qu.: 451.361                                        
##  Max.   :1293.117                                        
##     2010-12           2011-01            2011-02         
##  Min.   :  -0.004   Length:11          Length:11         
##  1st Qu.:   4.466   Class :character   Class :character  
##  Median : 121.422   Mode  :character   Mode  :character  
##  Mean   : 312.917                                        
##  3rd Qu.: 388.385                                        
##  Max.   :1216.262                                        
##     2011-03          2011-04            2011-05         
##  Min.   :   0.05   Length:11          Length:11         
##  1st Qu.:   6.03   Class :character   Class :character  
##  Median : 134.37   Mode  :character   Mode  :character  
##  Mean   : 345.17                                        
##  3rd Qu.: 448.56                                        
##  Max.   :1286.66                                        
##     2011-06           2011-07            2011-08         
##  Min.   :   0.054   Length:11          Length:11         
##  1st Qu.:   6.926   Class :character   Class :character  
##  Median : 135.581   Mode  :character   Mode  :character  
##  Mean   : 353.331                                        
##  3rd Qu.: 452.923                                        
##  Max.   :1302.414                                        
##     2011-09           2011-10         
##  Min.   :   0.043   Length:11         
##  1st Qu.:   6.660   Class :character  
##  Median : 136.901   Mode  :character  
##  Mean   : 362.555                     
##  3rd Qu.: 469.204                     
##  Max.   :1348.754

to note

observations stored as columns rather than as rows

Ex3: Removing unnecessary rows/cols

address the missing data

Remove rows: the “All Modes by Qtr” row contains NA values. This row should be in a different data frame: it is the quarterly average of weekday MBTA ridership, whereas this dataset tracks monthly average ridership.

The 7th row (“Pct Chg / Yr”) and the 11th row (“TOTAL”) are analysis, i.e. NOT observations, so they are removed as well.

# Drop rows 1, 7, and 11 of mbta: mbta2
# here by keeping every row position that is not one of those three
mbta2 <- mbta[setdiff(seq_len(nrow(mbta)), c(1, 7, 11)), ]

# alternatively,
# build a logical mask of the rows to keep from the labels in the mode column
keep <- is.na(match(mbta$mode, c("All Modes by Qtr", "Pct Chg / Yr", "TOTAL")))
mbta2 <- mbta[keep, ]
# i.e. decide which rows you want to keep and subset with those indices

1st col = listing the row numbers

# Drop the leading row-number column of mbta2: mbta3
# (a negative column index excludes that position)
mbta3 <- mbta2[-1]

Ex4: Observations are stored in columns

# different modes of transportation (commuter rail, bus, subway, ferry, ...) are variables
# providing information about each month's average ridership

# months = observations
# as you go through time, the month changes, but the modes of transport offered by the T do not.

head(mbta3)
## # A tibble: 6 x 59
##   mode    `2007-01`   `2007-02`  `2007-03` `2007-04`  `2007-05`  `2007-06`
##   <chr>   <chr>       <chr>          <dbl> <chr>      <chr>          <dbl>
## 1 Boat    4           3.6            40    4.3        4.9000000~      5.8 
## 2 Bus     335.819000~ 338.67500~    340.   352.16199~ 354.36700~    351.  
## 3 Commut~ 142.199999~ 138.5         138.   139.5      139           143   
## 4 Heavy ~ 435.293999~ 448.27100~    459.   472.20100~ 474.57900~    477.  
## 5 Light ~ 227.230999~ 240.262       241.   255.55699~ 248.262       246.  
## 6 Privat~ 4.77200000~ 4.4169999~      4.57 4.5419999~ 4.7679999~      4.72
## # ... with 52 more variables: `2007-07` <chr>, `2007-08` <chr>,
## #   `2007-09` <dbl>, `2007-10` <chr>, `2007-11` <chr>, `2007-12` <dbl>,
## #   `2008-01` <chr>, `2008-02` <chr>, `2008-03` <dbl>, `2008-04` <chr>,
## #   `2008-05` <chr>, `2008-06` <dbl>, `2008-07` <chr>, `2008-08` <chr>,
## #   `2008-09` <dbl>, `2008-10` <chr>, `2008-11` <chr>, `2008-12` <dbl>,
## #   `2009-01` <chr>, `2009-02` <chr>, `2009-03` <dbl>, `2009-04` <chr>,
## #   `2009-05` <chr>, `2009-06` <dbl>, `2009-07` <chr>, `2009-08` <chr>,
## #   `2009-09` <dbl>, `2009-10` <chr>, `2009-11` <chr>, `2009-12` <dbl>,
## #   `2010-01` <chr>, `2010-02` <chr>, `2010-03` <dbl>, `2010-04` <chr>,
## #   `2010-05` <chr>, `2010-06` <dbl>, `2010-07` <chr>, `2010-08` <chr>,
## #   `2010-09` <dbl>, `2010-10` <chr>, `2010-11` <chr>, `2010-12` <dbl>,
## #   `2011-01` <chr>, `2011-02` <chr>, `2011-03` <dbl>, `2011-04` <chr>,
## #   `2011-05` <chr>, `2011-06` <dbl>, `2011-07` <chr>, `2011-08` <chr>,
## #   `2011-09` <dbl>, `2011-10` <chr>

# want variables in columns 

# Load tidyr
library(tidyr)

# Reshape mbta3 from wide to long: mbta4
# every column except mode becomes a (month, thou_riders) key-value pair
mbta4 <- gather(data = mbta3, key = month, value = thou_riders, -mode)
head(mbta4) # the dataset is in long form now
## # A tibble: 6 x 3
##   mode          month   thou_riders       
##   <chr>         <chr>   <chr>             
## 1 Boat          2007-01 4                 
## 2 Bus           2007-01 335.81900000000002
## 3 Commuter Rail 2007-01 142.19999999999999
## 4 Heavy Rail    2007-01 435.29399999999998
## 5 Light Rail    2007-01 227.23099999999999
## 6 Private Bus   2007-01 4.7720000000000002

Ex5: Type conversions

# thou_riders (the average weekday ridership column) holds character strings
# Convert the column to numeric values
mbta4[["thou_riders"]] <- as.numeric(mbta4[["thou_riders"]])

Ex6: Variables are stored in both rows and columns

variables are stored as “keys” in the “mode” column


# spread() from tidyr turns each "key" in the mode column into its own column,
# filled with the average weekday ridership for the given month and mode
# Spread the contents of mbta4: mbta5
mbta5 <- spread(data = mbta4, key = mode, value = thou_riders)
head(mbta5)
## # A tibble: 6 x 9
##   month    Boat   Bus `Commuter Rail` `Heavy Rail` `Light Rail`
##   <chr>   <dbl> <dbl>           <dbl>        <dbl>        <dbl>
## 1 2007-01   4    336.            142.         435.         227.
## 2 2007-02   3.6  339.            138.         448.         240.
## 3 2007-03  40    340.            138.         459.         241.
## 4 2007-04   4.3  352.            140.         472.         256.
## 5 2007-05   4.9  354.            139          475.         248.
## 6 2007-06   5.8  351.            143          477.         246.
## # ... with 3 more variables: `Private Bus` <dbl>, RIDE <dbl>, `Trackless
## #   Trolley` <dbl>

Ex7: Separating columns

month and year are together in the same column

separate() should take 3 arguments: (1) a data frame; (2) a column name (no quotes) to separate; (3) a character vector containing the names of the new columns (with quotes).

# Break the combined month column into year and month columns: mbta6
# (the split happens at the dash)
mbta6 <- separate(data = mbta5, col = month, into = c("year", "month"))
head(mbta6)
## # A tibble: 6 x 10
##   year  month  Boat   Bus `Commuter Rail` `Heavy Rail` `Light Rail`
##   <chr> <chr> <dbl> <dbl>           <dbl>        <dbl>        <dbl>
## 1 2007  01      4    336.            142.         435.         227.
## 2 2007  02      3.6  339.            138.         448.         240.
## 3 2007  03     40    340.            138.         459.         241.
## 4 2007  04      4.3  352.            140.         472.         256.
## 5 2007  05      4.9  354.            139          475.         248.
## 6 2007  06      5.8  351.            143          477.         246.
## # ... with 3 more variables: `Private Bus` <dbl>, RIDE <dbl>, `Trackless
## #   Trolley` <dbl>

Ex8: values = reasonable?

screen the data for any obvious mistakes and/or outliers

summary(mbta6) # note Boat column stats.
##      year              month                Boat             Bus       
##  Length:58          Length:58          Min.   : 2.985   Min.   :312.9  
##  Class :character   Class :character   1st Qu.: 3.494   1st Qu.:345.6  
##  Mode  :character   Mode  :character   Median : 4.293   Median :359.9  
##                                        Mean   : 5.068   Mean   :358.6  
##                                        3rd Qu.: 5.356   3rd Qu.:372.2  
##                                        Max.   :40.000   Max.   :398.5  
##  Commuter Rail     Heavy Rail      Light Rail     Private Bus   
##  Min.   :121.4   Min.   :435.3   Min.   :194.4   Min.   :2.213  
##  1st Qu.:131.4   1st Qu.:471.1   1st Qu.:220.6   1st Qu.:2.641  
##  Median :138.8   Median :487.3   Median :231.9   Median :2.820  
##  Mean   :137.4   Mean   :489.3   Mean   :233.0   Mean   :3.352  
##  3rd Qu.:142.4   3rd Qu.:511.3   3rd Qu.:244.5   3rd Qu.:4.167  
##  Max.   :153.0   Max.   :554.9   Max.   :271.1   Max.   :4.878  
##       RIDE       Trackless Trolley
##  Min.   :4.900   Min.   : 5.777   
##  1st Qu.:5.965   1st Qu.:11.679   
##  Median :6.615   Median :12.598   
##  Mean   :6.604   Mean   :12.125   
##  3rd Qu.:7.149   3rd Qu.:13.320   
##  Max.   :8.598   Max.   :15.109
hist(mbta6$Boat) # every value clustered around 4 and one loner out around 40

Ex9: entry error

Every month, average weekday commuter boat ridership was on either side of four thousand, and then it suddenly jumped to 40 thousand!? This is an entry error => locate it and amend it to 4.

Find the row number of the incorrect value

# numeric variable i
# store the index of the incorrect Boat value in mbta6
# Combine a call to which()
# with a comparison operator (i.e. >)
# to determine the row number.
# The summary and histogram show the values clustered around 4 with a single
# outlier at 40, so a threshold isolates that row without relying on an
# exact == comparison against a double.
i <- which(mbta6$Boat > 30)

Replace the incorrect value with 4

# Overwrite the Boat entry at row i with 4
mbta6[["Boat"]][i] <- 4

Verify that the change was made by looking at another histogram of mbta6$Boat

hist(mbta6$Boat)

The following plots use the long version of the data, as in Exercise 4.

set up the df for use in the ggplot() functions

head(mbta6)
## # A tibble: 6 x 10
##   year  month  Boat   Bus `Commuter Rail` `Heavy Rail` `Light Rail`
##   <chr> <chr> <dbl> <dbl>           <dbl>        <dbl>        <dbl>
## 1 2007  01      4    336.            142.         435.         227.
## 2 2007  02      3.6  339.            138.         448.         240.
## 3 2007  03      4    340.            138.         459.         241.
## 4 2007  04      4.3  352.            140.         472.         256.
## 5 2007  05      4.9  354.            139          475.         248.
## 6 2007  06      5.8  351.            143          477.         246.
## # ... with 3 more variables: `Private Bus` <dbl>, RIDE <dbl>, `Trackless
## #   Trolley` <dbl>
table(mbta4$mode)
## 
##              Boat               Bus     Commuter Rail        Heavy Rail 
##                58                58                58                58 
##        Light Rail       Private Bus              RIDE Trackless Trolley 
##                58                58                58                58
# Long-format copy of mbta6: year and month are glued into year_mon,
# then every mode column is gathered into (mode, thou_riders) pairs
mbta_all <- gather(
  unite(mbta6, year_mon, year, month, sep = ""),
  mode, thou_riders, -year_mon
)

# Keep only the Boat and Trackless Trolley rows
mbta_boat <- filter(mbta_all, mode %in% c("Boat", "Trackless Trolley"))
head(mbta_boat)
## # A tibble: 6 x 3
##   year_mon mode  thou_riders
##   <chr>    <chr>       <dbl>
## 1 200701   Boat          4  
## 2 200702   Boat          3.6
## 3 200703   Boat          4  
## 4 200704   Boat          4.3
## 5 200705   Boat          4.9
## 6 200706   Boat          5.8
table(mbta_boat$mode)
## 
##              Boat Trackless Trolley 
##                58                58

Look at Boat and Trackless Trolley ridership over time Note: seasonal variation in Boat ridership

# install.packages("ggplot2")
library(ggplot2)

# year_mon is a character column, so it is drawn on a discrete x axis.
# Only each January is labelled to keep the axis readable; the breaks are
# written as strings so they match the axis values directly instead of
# relying on implicit number-to-character coercion.
ggplot(mbta_boat, aes(x = year_mon, y = thou_riders, col = mode)) +
  geom_point() +
  scale_x_discrete(
    name = "Month",
    breaks = c("200701", "200801", "200901", "201001", "201101")
  ) +
  scale_y_continuous(name = "Avg Weekday Ridership (thousands)")

Look at all T ridership over time

# Scatter plot of every mode's ridership by month, coloured by mode.
# The x values (year_mon) are plotted on a discrete axis, so the breaks are
# written as strings to match them directly rather than through implicit
# number-to-character coercion; only each January is labelled.
ggplot(mbta_all, aes(x = year_mon, y = thou_riders, col = mode)) +
  geom_point() +
  scale_x_discrete(
    name = "Month",
    breaks = c("200701", "200801", "200901", "201001", "201101")
  ) +
  scale_y_continuous(name = "Avg Weekday Ridership (thousands)")

World Food

nutrition analysis / sugar content

Ex1: Import data


library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday,
##     week, yday, year
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

url_food <- "http://s3.amazonaws.com/assets.datacamp.com/production/course_1294/datasets/food.csv"
food_csv <- "food.csv"

# Fetch the CSV once; the read below uses this local copy instead of
# downloading the same file a second time.
download.file(url_food, food_csv)

# data table
# fread() is typically much faster than read.csv() on large files.
# encoding = "UTF-8" marks the text columns as UTF-8 so accented characters
# print correctly even when the session's native encoding is not UTF-8.
# NOTE(review): assumes the file is UTF-8 encoded -- confirm against the source.
dt_food <- fread(food_csv, encoding = "UTF-8")

# Convert food to a data frame
df_food <- data.frame(dt_food)
food <- df_food

Ex2: Examine data

# Summarise every column of food: quartiles, mean and NA counts for numeric
# columns; length, class and mode for character columns
summary(food)
##        V1              code            url              creator         
##  Min.   :   1.0   Min.   :100030   Length:1500        Length:1500       
##  1st Qu.: 375.8   1st Qu.:124975   Class :character   Class :character  
##  Median : 750.5   Median :149514   Mode  :character   Mode  :character  
##  Mean   : 750.5   Mean   :149613                                        
##  3rd Qu.:1125.2   3rd Qu.:174506                                        
##  Max.   :1500.0   Max.   :199880                                        
##                                                                         
##    created_t         created_datetime   last_modified_t    
##  Min.   :1.332e+09   Length:1500        Min.   :1.340e+09  
##  1st Qu.:1.394e+09   Class :character   1st Qu.:1.424e+09  
##  Median :1.425e+09   Mode  :character   Median :1.437e+09  
##  Mean   :1.414e+09                      Mean   :1.430e+09  
##  3rd Qu.:1.436e+09                      3rd Qu.:1.446e+09  
##  Max.   :1.453e+09                      Max.   :1.453e+09  
##                                                            
##  last_modified_datetime product_name       generic_name      
##  Length:1500            Length:1500        Length:1500       
##  Class :character       Class :character   Class :character  
##  Mode  :character       Mode  :character   Mode  :character  
##                                                              
##                                                              
##                                                              
##                                                              
##    quantity          packaging         packaging_tags    
##  Length:1500        Length:1500        Length:1500       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##     brands          brands_tags         categories       
##  Length:1500        Length:1500        Length:1500       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  categories_tags    categories_en        origins         
##  Length:1500        Length:1500        Length:1500       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  origins_tags       manufacturing_places manufacturing_places_tags
##  Length:1500        Length:1500          Length:1500              
##  Class :character   Class :character     Class :character         
##  Mode  :character   Mode  :character     Mode  :character         
##                                                                   
##                                                                   
##                                                                   
##                                                                   
##     labels          labels_tags         labels_en        
##  Length:1500        Length:1500        Length:1500       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##   emb_codes         emb_codes_tags     first_packaging_code_geo
##  Length:1500        Length:1500        Length:1500             
##  Class :character   Class :character   Class :character        
##  Mode  :character   Mode  :character   Mode  :character        
##                                                                
##                                                                
##                                                                
##                                                                
##   cities        cities_tags        purchase_places       stores         
##  Mode:logical   Length:1500        Length:1500        Length:1500       
##  NA's:1500      Class :character   Class :character   Class :character  
##                 Mode  :character   Mode  :character   Mode  :character  
##                                                                         
##                                                                         
##                                                                         
##                                                                         
##   countries         countries_tags     countries_en      
##  Length:1500        Length:1500        Length:1500       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  ingredients_text    allergens         allergens_en      traces         
##  Length:1500        Length:1500        Mode:logical   Length:1500       
##  Class :character   Class :character   NA's:1500      Class :character  
##  Mode  :character   Mode  :character                  Mode  :character  
##                                                                         
##                                                                         
##                                                                         
##                                                                         
##  traces_tags         traces_en         serving_size       no_nutriments 
##  Length:1500        Length:1500        Length:1500        Mode:logical  
##  Class :character   Class :character   Class :character   NA's:1500     
##  Mode  :character   Mode  :character   Mode  :character                 
##                                                                         
##                                                                         
##                                                                         
##                                                                         
##   additives_n      additives         additives_tags     additives_en      
##  Min.   : 0.000   Length:1500        Length:1500        Length:1500       
##  1st Qu.: 0.000   Class :character   Class :character   Class :character  
##  Median : 1.000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 1.846                                                           
##  3rd Qu.: 3.000                                                           
##  Max.   :17.000                                                           
##  NA's   :514                                                              
##  ingredients_from_palm_oil_n ingredients_from_palm_oil
##  Min.   :0.0000              Mode:logical             
##  1st Qu.:0.0000              NA's:1500                
##  Median :0.0000                                       
##  Mean   :0.0487                                       
##  3rd Qu.:0.0000                                       
##  Max.   :1.0000                                       
##  NA's   :514                                          
##  ingredients_from_palm_oil_tags ingredients_that_may_be_from_palm_oil_n
##  Length:1500                    Min.   :0.0000                         
##  Class :character               1st Qu.:0.0000                         
##  Mode  :character               Median :0.0000                         
##                                 Mean   :0.1379                         
##                                 3rd Qu.:0.0000                         
##                                 Max.   :4.0000                         
##                                 NA's   :514                            
##  ingredients_that_may_be_from_palm_oil
##  Mode:logical                         
##  NA's:1500                            
##                                       
##                                       
##                                       
##                                       
##                                       
##  ingredients_that_may_be_from_palm_oil_tags nutrition_grade_uk
##  Length:1500                                Mode:logical      
##  Class :character                           NA's:1500         
##  Mode  :character                                             
##                                                               
##                                                               
##                                                               
##                                                               
##  nutrition_grade_fr pnns_groups_1      pnns_groups_2     
##  Length:1500        Length:1500        Length:1500       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##     states          states_tags         states_en        
##  Length:1500        Length:1500        Length:1500       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  main_category      main_category_en    image_url        
##  Length:1500        Length:1500        Length:1500       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  image_small_url     energy_100g     energy_from_fat_100g    fat_100g     
##  Length:1500        Min.   :   0.0   Min.   :   0.00      Min.   :  0.00  
##  Class :character   1st Qu.: 369.8   1st Qu.:  35.98      1st Qu.:  0.90  
##  Mode  :character   Median : 966.5   Median : 237.00      Median :  6.00  
##                     Mean   :1083.2   Mean   : 668.41      Mean   : 13.39  
##                     3rd Qu.:1641.5   3rd Qu.: 974.00      3rd Qu.: 20.00  
##                     Max.   :3700.0   Max.   :2900.00      Max.   :100.00  
##                     NA's   :700      NA's   :1486         NA's   :708     
##  saturated_fat_100g butyric_acid_100g caproic_acid_100g caprylic_acid_100g
##  Min.   : 0.000     Mode:logical      Mode:logical      Mode:logical      
##  1st Qu.: 0.200     NA's:1500         NA's:1500         NA's:1500         
##  Median : 1.700                                                           
##  Mean   : 4.874                                                           
##  3rd Qu.: 6.500                                                           
##  Max.   :57.000                                                           
##  NA's   :797                                                              
##  capric_acid_100g lauric_acid_100g myristic_acid_100g palmitic_acid_100g
##  Mode:logical     Mode:logical     Mode:logical       Mode:logical      
##  NA's:1500        NA's:1500        NA's:1500          NA's:1500         
##                                                                         
##                                                                         
##                                                                         
##                                                                         
##                                                                         
##  stearic_acid_100g arachidic_acid_100g behenic_acid_100g
##  Mode:logical      Mode:logical        Mode:logical     
##  NA's:1500         NA's:1500           NA's:1500        
##                                                         
##                                                         
##                                                         
##                                                         
##                                                         
##  lignoceric_acid_100g cerotic_acid_100g montanic_acid_100g
##  Mode:logical         Mode:logical      Mode:logical      
##  NA's:1500            NA's:1500         NA's:1500         
##                                                           
##                                                           
##                                                           
##                                                           
##                                                           
##  melissic_acid_100g monounsaturated_fat_100g polyunsaturated_fat_100g
##  Mode:logical       Min.   : 0.00            Min.   : 0.400          
##  NA's:1500          1st Qu.: 3.87            1st Qu.: 1.653          
##                     Median : 9.50            Median : 3.900          
##                     Mean   :19.77            Mean   : 9.986          
##                     3rd Qu.:29.00            3rd Qu.:12.700          
##                     Max.   :75.00            Max.   :46.200          
##                     NA's   :1465             NA's   :1464            
##  omega_3_fat_100g alpha_linolenic_acid_100g eicosapentaenoic_acid_100g
##  Min.   : 0.033   Min.   :0.0800            Min.   :0.721             
##  1st Qu.: 1.300   1st Qu.:0.0905            1st Qu.:0.721             
##  Median : 3.000   Median :0.1010            Median :0.721             
##  Mean   : 3.726   Mean   :0.1737            Mean   :0.721             
##  3rd Qu.: 3.200   3rd Qu.:0.2205            3rd Qu.:0.721             
##  Max.   :12.400   Max.   :0.3400            Max.   :0.721             
##  NA's   :1491     NA's   :1497              NA's   :1499              
##  docosahexaenoic_acid_100g omega_6_fat_100g linoleic_acid_100g
##  Min.   :1.09              Min.   :0.25     Min.   :0.5000    
##  1st Qu.:1.09              1st Qu.:0.25     1st Qu.:0.5165    
##  Median :1.09              Median :0.25     Median :0.5330    
##  Mean   :1.09              Mean   :0.25     Mean   :0.5330    
##  3rd Qu.:1.09              3rd Qu.:0.25     3rd Qu.:0.5495    
##  Max.   :1.09              Max.   :0.25     Max.   :0.5660    
##  NA's   :1499              NA's   :1499     NA's   :1498      
##  arachidonic_acid_100g gamma_linolenic_acid_100g
##  Mode:logical          Mode:logical             
##  NA's:1500             NA's:1500                
##                                                 
##                                                 
##                                                 
##                                                 
##                                                 
##  dihomo_gamma_linolenic_acid_100g omega_9_fat_100g oleic_acid_100g
##  Mode:logical                     Mode:logical     Mode:logical   
##  NA's:1500                        NA's:1500        NA's:1500      
##                                                                   
##                                                                   
##                                                                   
##                                                                   
##                                                                   
##  elaidic_acid_100g gondoic_acid_100g mead_acid_100g erucic_acid_100g
##  Mode:logical      Mode:logical      Mode:logical   Mode:logical    
##  NA's:1500         NA's:1500         NA's:1500      NA's:1500       
##                                                                     
##                                                                     
##                                                                     
##                                                                     
##                                                                     
##  nervonic_acid_100g trans_fat_100g   cholesterol_100g carbohydrates_100g
##  Mode:logical       Min.   :0.0000   Min.   :0.0000   Min.   :  0.000   
##  NA's:1500          1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:  3.792   
##                     Median :0.0000   Median :0.0000   Median : 13.500   
##                     Mean   :0.0105   Mean   :0.0265   Mean   : 27.958   
##                     3rd Qu.:0.0000   3rd Qu.:0.0026   3rd Qu.: 55.000   
##                     Max.   :0.1000   Max.   :0.4300   Max.   :100.000   
##                     NA's   :1481     NA's   :1477     NA's   :708       
##   sugars_100g     sucrose_100g   glucose_100g   fructose_100g 
##  Min.   :  0.00   Mode:logical   Mode:logical   Min.   :100   
##  1st Qu.:  1.00   NA's:1500      NA's:1500      1st Qu.:100   
##  Median :  4.05                                 Median :100   
##  Mean   : 12.66                                 Mean   :100   
##  3rd Qu.: 14.70                                 3rd Qu.:100   
##  Max.   :100.00                                 Max.   :100   
##  NA's   :788                                    NA's   :1499  
##   lactose_100g   maltose_100g   maltodextrins_100g  starch_100g   
##  Min.   :0.000   Mode:logical   Mode:logical       Min.   : 0.00  
##  1st Qu.:0.250   NA's:1500      NA's:1500          1st Qu.: 9.45  
##  Median :0.500                                     Median :39.50  
##  Mean   :2.933                                     Mean   :30.73  
##  3rd Qu.:4.400                                     3rd Qu.:42.85  
##  Max.   :8.300                                     Max.   :71.00  
##  NA's   :1497                                      NA's   :1493   
##   polyols_100g     fiber_100g     proteins_100g     casein_100g  
##  Min.   : 8.60   Min.   : 0.000   Min.   : 0.000   Min.   :1.1   
##  1st Qu.:59.10   1st Qu.: 0.500   1st Qu.: 1.500   1st Qu.:1.1   
##  Median :67.00   Median : 1.750   Median : 6.000   Median :1.1   
##  Mean   :56.06   Mean   : 2.823   Mean   : 7.563   Mean   :1.1   
##  3rd Qu.:69.80   3rd Qu.: 3.500   3rd Qu.:10.675   3rd Qu.:1.1   
##  Max.   :70.00   Max.   :46.700   Max.   :61.000   Max.   :1.1   
##  NA's   :1491    NA's   :994      NA's   :710      NA's   :1499  
##  serum_proteins_100g nucleotides_100g   salt_100g         sodium_100g     
##  Mode:logical        Mode:logical     Min.   :  0.0000   Min.   : 0.0000  
##  NA's:1500           NA's:1500        1st Qu.:  0.0438   1st Qu.: 0.0172  
##                                       Median :  0.4498   Median : 0.1771  
##                                       Mean   :  1.1205   Mean   : 0.4409  
##                                       3rd Qu.:  1.1938   3rd Qu.: 0.4700  
##                                       Max.   :102.0000   Max.   :40.0000  
##                                       NA's   :780        NA's   :780      
##   alcohol_100g   vitamin_a_100g   beta_carotene_100g vitamin_d_100g 
##  Min.   : 0.00   Min.   :0.0000   Mode:logical       Min.   :0e+00  
##  1st Qu.: 0.00   1st Qu.:0.0000   NA's:1500          1st Qu.:0e+00  
##  Median : 5.50   Median :0.0001                      Median :0e+00  
##  Mean   :10.07   Mean   :0.0003                      Mean   :0e+00  
##  3rd Qu.:13.00   3rd Qu.:0.0006                      3rd Qu.:0e+00  
##  Max.   :50.00   Max.   :0.0013                      Max.   :1e-04  
##  NA's   :1433    NA's   :1477                        NA's   :1485   
##  vitamin_e_100g   vitamin_k_100g vitamin_c_100g  vitamin_b1_100g 
##  Min.   :0.0005   Min.   :0      Min.   :0.000   Min.   :0.0001  
##  1st Qu.:0.0021   1st Qu.:0      1st Qu.:0.002   1st Qu.:0.0003  
##  Median :0.0044   Median :0      Median :0.019   Median :0.0004  
##  Mean   :0.0069   Mean   :0      Mean   :0.025   Mean   :0.0006  
##  3rd Qu.:0.0097   3rd Qu.:0      3rd Qu.:0.030   3rd Qu.:0.0010  
##  Max.   :0.0320   Max.   :0      Max.   :0.217   Max.   :0.0013  
##  NA's   :1478     NA's   :1498   NA's   :1459    NA's   :1478    
##  vitamin_b2_100g  vitamin_pp_100g  vitamin_b6_100g  vitamin_b9_100g
##  Min.   :0.0002   Min.   :0.0006   Min.   :0.0001   Min.   :0e+00  
##  1st Qu.:0.0003   1st Qu.:0.0033   1st Qu.:0.0002   1st Qu.:0e+00  
##  Median :0.0009   Median :0.0069   Median :0.0008   Median :1e-04  
##  Mean   :0.0011   Mean   :0.0086   Mean   :0.0112   Mean   :1e-04  
##  3rd Qu.:0.0013   3rd Qu.:0.0140   3rd Qu.:0.0012   3rd Qu.:2e-04  
##  Max.   :0.0066   Max.   :0.0160   Max.   :0.2000   Max.   :2e-04  
##  NA's   :1483     NA's   :1484     NA's   :1481     NA's   :1483   
##  vitamin_b12_100g  biotin_100g   pantothenic_acid_100g  silica_100g   
##  Min.   :0        Min.   :0      Min.   :0.0000        Min.   :8e-04  
##  1st Qu.:0        1st Qu.:0      1st Qu.:0.0007        1st Qu.:8e-04  
##  Median :0        Median :0      Median :0.0020        Median :8e-04  
##  Mean   :0        Mean   :0      Mean   :0.0027        Mean   :8e-04  
##  3rd Qu.:0        3rd Qu.:0      3rd Qu.:0.0051        3rd Qu.:8e-04  
##  Max.   :0        Max.   :0      Max.   :0.0060        Max.   :8e-04  
##  NA's   :1489     NA's   :1498   NA's   :1486          NA's   :1499   
##  bicarbonate_100g potassium_100g   chloride_100g     calcium_100g   
##  Min.   :0.0006   Min.   :0.0000   Min.   :0.0003   Min.   :0.0000  
##  1st Qu.:0.0678   1st Qu.:0.0650   1st Qu.:0.0006   1st Qu.:0.0450  
##  Median :0.1350   Median :0.1940   Median :0.0009   Median :0.1200  
##  Mean   :0.1692   Mean   :0.3288   Mean   :0.0144   Mean   :0.2040  
##  3rd Qu.:0.2535   3rd Qu.:0.3670   3rd Qu.:0.0214   3rd Qu.:0.1985  
##  Max.   :0.3720   Max.   :1.4300   Max.   :0.0420   Max.   :1.0000  
##  NA's   :1497     NA's   :1487     NA's   :1497     NA's   :1449    
##  phosphorus_100g    iron_100g      magnesium_100g     zinc_100g     
##  Min.   :0.0430   Min.   :0.0000   Min.   :0.0000   Min.   :0.0005  
##  1st Qu.:0.1938   1st Qu.:0.0012   1st Qu.:0.0670   1st Qu.:0.0009  
##  Median :0.3185   Median :0.0042   Median :0.1040   Median :0.0017  
##  Mean   :0.3777   Mean   :0.0045   Mean   :0.1066   Mean   :0.0016  
##  3rd Qu.:0.4340   3rd Qu.:0.0077   3rd Qu.:0.1300   3rd Qu.:0.0022  
##  Max.   :1.1550   Max.   :0.0137   Max.   :0.3330   Max.   :0.0026  
##  NA's   :1488     NA's   :1463     NA's   :1479     NA's   :1493    
##   copper_100g    manganese_100g fluoride_100g  selenium_100g 
##  Min.   :0e+00   Min.   :0      Min.   :0      Min.   :0     
##  1st Qu.:1e-04   1st Qu.:0      1st Qu.:0      1st Qu.:0     
##  Median :1e-04   Median :0      Median :0      Median :0     
##  Mean   :1e-04   Mean   :0      Mean   :0      Mean   :0     
##  3rd Qu.:1e-04   3rd Qu.:0      3rd Qu.:0      3rd Qu.:0     
##  Max.   :1e-04   Max.   :0      Max.   :0      Max.   :0     
##  NA's   :1498    NA's   :1499   NA's   :1498   NA's   :1499  
##  chromium_100g  molybdenum_100g  iodine_100g   caffeine_100g 
##  Mode:logical   Mode:logical    Min.   :0      Mode:logical  
##  NA's:1500      NA's:1500       1st Qu.:0      NA's:1500     
##                                 Median :0                    
##                                 Mean   :0                    
##                                 3rd Qu.:0                    
##                                 Max.   :0                    
##                                 NA's   :1499                 
##  taurine_100g   ph_100g        fruits_vegetables_nuts_100g
##  Mode:logical   Mode:logical   Min.   : 2.00              
##  NA's:1500      NA's:1500      1st Qu.:11.25              
##                                Median :42.00              
##                                Mean   :36.88              
##                                3rd Qu.:52.25              
##                                Max.   :80.00              
##                                NA's   :1470               
##  collagen_meat_protein_ratio_100g   cocoa_100g   chlorophyl_100g
##  Min.   :12.00                    Min.   :30     Mode:logical   
##  1st Qu.:13.50                    1st Qu.:47     NA's:1500      
##  Median :15.00                    Median :60                    
##  Mean   :15.67                    Mean   :57                    
##  3rd Qu.:17.50                    3rd Qu.:70                    
##  Max.   :20.00                    Max.   :81                    
##  NA's   :1497                     NA's   :1491                  
##  carbon_footprint_100g nutrition_score_fr_100g nutrition_score_uk_100g
##  Min.   : 12.00        Min.   :-12.000         Min.   :-12.000        
##  1st Qu.: 97.42        1st Qu.:  1.000         1st Qu.:  0.000        
##  Median :182.85        Median :  7.000         Median :  6.000        
##  Mean   :131.18        Mean   :  7.941         Mean   :  7.631        
##  3rd Qu.:190.78        3rd Qu.: 15.000         3rd Qu.: 16.000        
##  Max.   :198.70        Max.   : 28.000         Max.   : 28.000        
##  NA's   :1497          NA's   :825             NA's   :825
# Inspect first 6 rows of food
head(food, 6)
##   V1   code
## 1  1 100030
## 2  2 100050
## 3  3 100079
## 4  4 100094
## 5  5 100124
## 6  6 100136
##                                                                                                                            url
## 1 http://world-en.openfoodfacts.org/product/3222475745867/confiture-de-fraise-fraise-des-bois-au-sucre-de-canne-casino-delices
## 2                                         http://world-en.openfoodfacts.org/product/5410976880110/guylian-sea-shells-selection
## 3                                  http://world-en.openfoodfacts.org/product/3264750423503/pates-de-fruits-aromatisees-jacquot
## 4                                  http://world-en.openfoodfacts.org/product/8006040247001/nata-vegetal-a-base-de-soja-valsoia
## 5           http://world-en.openfoodfacts.org/product/8480000340764/semillas-de-girasol-con-cascara-tostadas-aguasal-hacendado
## 6                                                           http://world-en.openfoodfacts.org/product/0087703177727/soft-drink
##       creator  created_t     created_datetime last_modified_t
## 1    sebleouf 1424747544 2015-02-24T03:12:24Z      1438445887
## 2 foodorigins 1450316429 2015-12-17T01:40:29Z      1450817956
## 3    domdom26 1428674916 2015-04-10T14:08:36Z      1428739289
## 4     javichu 1420416591 2015-01-05T00:09:51Z      1420417876
## 5     javichu 1420501121 2015-01-05T23:38:41Z      1445700917
## 6 foodorigins 1437983923 2015-07-27T07:58:43Z      1445577476
##   last_modified_datetime
## 1   2015-08-01T16:18:07Z
## 2   2015-12-22T20:59:16Z
## 3   2015-04-11T08:01:29Z
## 4   2015-01-05T00:31:16Z
## 5   2015-10-24T15:35:17Z
## 6   2015-10-23T05:17:56Z
##                                            product_name
## 1 Confiture de fraise fraise des bois au sucre de canne
## 2                          Guylian Sea Shells Selection
## 3                         Pâtes de fruits aromatisées
## 4       Nata vegetal a base de soja &quot;Valsoia&quot;
## 5     Semillas de girasol con cáscara tostadas aguasal
## 6                                            Soft Drink
##                                        generic_name quantity
## 1                                                      265 g
## 2                                                       375g
## 3                                  Pâtes de fruits     1 kg
## 4                       Nata vegetal a base de soja   200 ml
## 5 Semillas de girasol con cáscara tostadas aguasal    200 g
## 6                                                           
##                                              packaging
## 1                                          Bocal,Verre
## 2                                          Plastic,Box
## 3                                     Carton,plastique
## 4                                           Tetra Brik
## 5 Bolsa de plástico,Envasado en atmósfera protectora
## 6                                                     
##                                       packaging_tags
## 1                                        bocal,verre
## 2                                        plastic,box
## 3                                   carton,plastique
## 4                                         tetra-brik
## 5 bolsa-de-plastico,envasado-en-atmosfera-protectora
## 6                                                   
##                                       brands
## 1                            Casino Délices
## 2                                    Guylian
## 3                                    Jacquot
## 4   Valsoia,//Propiedad de://,Valsoia S.p.A.
## 5 Hacendado,//Propiedad de://,Mercadona S.A.
## 6                                           
##                            brands_tags
## 1                       casino-delices
## 2                              guylian
## 3                              jacquot
## 4   valsoia,propiedad-de,valsoia-s-p-a
## 5 hacendado,propiedad-de,mercadona-s-a
## 6                                     
##                                                                                                                                                                                                                                                                                                                                                    categories
## 1 Aliments et boissons à base de végétaux,Aliments d'origine végétale,Aliments à base de fruits et de légumes,Petit-déjeuners,Produits à tartiner,Fruits et produits dérivés,Pâtes à tartiner végétaux,Produits à tartiner sucrés,Confitures et marmelades,Confitures,Confitures de fruits,Confitures de fruits rouges,Confitures de fraises
## 2                                                                                                                                                                                                                                                                                                                                                   Chocolate
## 3                                                                                                                                                                                                                                                                                                                                            pâtes de fruits
## 4                                                                                                                                                                                                  Alimentos y bebidas de origen vegetal,Alimentos de origen vegetal,Natas vegetales,Natas vegetales a base de soja para cocinar,Natas vegetales para cocinar
## 5                                                                                                                                Semillas de girasol y derivados, Semillas, Semillas de girasol, Semillas de girasol con cáscara, Semillas de girasol tostadas, Semillas de girasol con cáscara tostadas, Semillas de girasol con cáscara tostadas aguasal
## 6                                                                                                                                                                                                                                                                                                                                                            
##                                                                                                                                                                                                                                                              categories_tags
## 1              en:plant-based-foods-and-beverages,en:plant-based-foods,en:fruits-and-vegetables-based-foods,en:breakfasts,en:spreads,en:fruits-based-foods,en:plant-based-spreads,en:sweet-spreads,en:fruit-preserves,en:jams,en:fruit-jams,en:berry-jams,en:strawberry-jams
## 2                                                                                                                                                                                                                                             en:sugary-snacks,en:chocolates
## 3                                                                                                     en:plant-based-foods-and-beverages,en:plant-based-foods,en:fruits-and-vegetables-based-foods,en:sugary-snacks,en:confectioneries,en:fruits-based-foods,en:fruit-pastes
## 4                                                                                                                            en:plant-based-foods-and-beverages,en:plant-based-foods,en:plant-based-creams,en:plant-based-creams-for-cooking,en:soy-based-creams-for-cooking
## 5 en:plant-based-foods-and-beverages,en:plant-based-foods,en:seeds,en:sunflower-seeds-and-their-products,en:sunflower-seeds,en:roasted-sunflower-seeds,en:unshelled-sunflower-seeds,en:roasted-unshelled-sunflower-seeds,es:semillas-de-girasol-con-cascara-tostadas-aguasal
## 6                                                                                                                                                                                                                                                                           
##                                                                                                                                                                                                                                        categories_en
## 1                             Plant-based foods and beverages,Plant-based foods,Fruits and vegetables based foods,Breakfasts,Spreads,Fruits based foods,Plant-based spreads,Sweet spreads,Fruit preserves,Jams,Fruit jams,Berry jams,Strawberry jams
## 2                                                                                                                                                                                                                           Sugary snacks,Chocolates
## 3                                                                                                  Plant-based foods and beverages,Plant-based foods,Fruits and vegetables based foods,Sugary snacks,Confectioneries,Fruits based foods,Fruit pastes
## 4                                                                                                                   Plant-based foods and beverages,Plant-based foods,Plant-based creams,Plant-based creams for cooking,Soy-based creams for cooking
## 5 Plant-based foods and beverages,Plant-based foods,Seeds,Sunflower seeds and their products,Sunflower seeds,Roasted sunflower seeds,Unshelled sunflower seeds,Roasted unshelled sunflower seeds,es:Semillas-de-girasol-con-cascara-tostadas-aguasal
## 6                                                                                                                                                                                                                                                   
##       origins origins_tags
## 1                         
## 2                         
## 3                         
## 4                         
## 5   Argentina    argentina
## 6 South Korea  south-korea
##                                            manufacturing_places
## 1                                                        France
## 2                                                       Belgium
## 3                                                              
## 4                                                        Italia
## 5 Beniparrell,Valencia (provincia),Comunidad Valenciana,España
## 6                                                   South Korea
##                                    manufacturing_places_tags
## 1                                                     france
## 2                                                    belgium
## 3                                                           
## 4                                                     italia
## 5 beniparrell,valencia-provincia,comunidad-valenciana,espana
## 6                                                south-korea
##                                              labels
## 1                                                  
## 2                                                  
## 3                                                  
## 4 Vegetariano,Vegano,Sin gluten,Sin OMG,Sin lactosa
## 5                     Vegetariano,Vegano,Sin gluten
## 6                                                  
##                                                      labels_tags
## 1                                                               
## 2                                                               
## 3                                                               
## 4 en:vegetarian,en:vegan,en:gluten-free,en:no-gmos,en:no-lactose
## 5                          en:vegetarian,en:vegan,en:gluten-free
## 6                                                               
##                                         labels_en
## 1                                                
## 2                                                
## 3                                                
## 4 Vegetarian,Vegan,Gluten-free,No GMOs,No lactose
## 5                    Vegetarian,Vegan,Gluten-free
## 6                                                
##                                     emb_codes
## 1                                   EMB 78015
## 2                                            
## 3                                            
## 4                                            
## 5 ES 21.016540/V EC,ENVASADOR:,IMPORTACO S.A.
## 6                                            
##                              emb_codes_tags first_packaging_code_geo
## 1                                 emb-78015       48.983333,2.066667
## 2                                                                   
## 3                                                                   
## 4                                                                   
## 5 es-21-016540-v-ec,envasador,importaco-s-a                         
## 6                                                                   
##   cities             cities_tags purchase_places           stores
## 1     NA andresy-yvelines-france     Lyon,France           Casino
## 2     NA                           NSW,Australia                 
## 3     NA                                  France                 
## 4     NA                          Madrid,España El Corte Inglés
## 5     NA                          Madrid,España        Mercadona
## 6     NA                                                         
##   countries countries_tags countries_en
## 1    France      en:france       France
## 2 Australia   en:australia    Australia
## 3    France      en:france       France
## 4   España       en:spain        Spain
## 5   España       en:spain        Spain
## 6 Australia   en:australia    Australia
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 ingredients_text
## 1                                                                                                                                                                                                                                                                                                                                                                     Sucre de canne, fraises 40 g, fraises des bois 14 g, gélifiant : pectines de fruits, jus de citron concentré. Préparée avec 54 g de fruits pour 100 g de produit fini.
## 2                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               
## 3                                                                                                                                                                                                                                                                                                                        Pulpe de pommes 50% , sucre, sirop de glucose, gélifiant : pectine, acidifiant : acide citrique, arômes, colorants naturels : extrait de paprika â\200” complexes cuivreâ\200”chlorophyllines â\200” curcumine â\200” antnocyanes
## 4 Extracto de soja (78%) (agua, semillas de soja 8,3%), grasas vegetales, jarabe de glucosa, dextrosa, emulsionante: mono- y diglicéridos de ácidos grasos (E-471), sal marina, estabilizantes: goma xantana (E-415), carragenatos (E-407), goma guar (E-412); aromas, antioxidante: extractos de tocoferoles (de soja) (E-306). (Nota: el envase en italiano del paquete -que puede verse en el enlace-, especifica que el producto es 100% vegetal. Por tanto los mono- y diglicéridos de ácidos grasos (E-471) son de origen no animal). 
## 5                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        Pipas de girasol y sal.
## 6                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               
##   allergens allergens_en                        traces        traces_tags
## 1                     NA          Lait,Fruits à coque    en:milk,en:nuts
## 2                     NA                                                 
## 3                     NA                                                 
## 4                     NA                                                 
## 5                     NA Frutos de cáscara,Cacahuetes en:nuts,en:peanuts
## 6                     NA                                                 
##      traces_en serving_size no_nutriments additives_n
## 1    Milk,Nuts         15 g            NA           1
## 2                                      NA          NA
## 3                                      NA           2
## 4                                      NA           5
## 5 Nuts,Peanuts                         NA           0
## 6                                      NA          NA
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              
                                                                                                                                             additives
## 1 [ sucre-de-canne -> fr:sucre-de-canne  ]  [ sucre-de -> fr:sucre-de  ]  [ sucre -> fr:sucre  ]  [ fraises-40-g -> fr:fraises-40-g  ]  [ fraises-40 -> fr:fraises-40  ]  [ fraises -> fr:fraises  ]  [ fraises-des-bois-14-g -> fr:fraises-des-bois-14-g  ]  [ fraises-des-bois-14 -> fr:fraises-des-bois-14  ]  [ fraises-des-bois -> fr:fraises-des-bois  ]  [ fraises-des -> fr:fraises-des  ]  [ fraises -> fr:fraises  ]  [ pectines-de-fruits -> fr:pectines-de-fruits  ]  [ pectines-de -> fr:pectines-de  ]  [ pectines -> en:e440  -> exists  ]  [ jus-de-citron-concentre-preparee-avec-54-g-de-fruits-pour-100-g-de-produit-fini -> fr:jus-de-citron-concentre-preparee-avec-54-g-de-fruits-pour-100-g-de-produit-fini  ]  [ jus-de-citron-concentre-preparee-avec-54-g-de-fruits-pour-100-g-de-produit -> fr:jus-de-citron-concentre-preparee-avec-54-g-de-fruits-pour-100-g-de-produit  ]  [ jus-de-citron-concentre-preparee-avec-54-g-de-fruits-pour-100-g-de -> fr:jus-de-citron-concentre-preparee-avec-54-g-de-fruits-pour-100-g-de  ]  [ jus-de-citron-concentre-preparee-avec-54-g-de-fruits-pour-100-g -> fr:jus-de-citron-concentre-preparee-avec-54-g-de-fruits-pour-100-g  ]  [ jus-de-citron-concentre-preparee-avec-54-g-de-fruits-pour-100 -> fr:jus-de-citron-concentre-preparee-avec-54-g-de-fruits-pour-100  ]  [ jus-de-citron-concentre-preparee-avec-54-g-de-fruits-pour -> fr:jus-de-citron-concentre-preparee-avec-54-g-de-fruits-pour  ]  [ jus-de-citron-concentre-preparee-avec-54-g-de-fruits -> fr:jus-de-citron-concentre-preparee-avec-54-g-de-fruits  ]  [ jus-de-citron-concentre-preparee-avec-54-g-de -> fr:jus-de-citron-concentre-preparee-avec-54-g-de  ]  [ jus-de-citron-concentre-preparee-avec-54-g -> fr:jus-de-citron-concentre-preparee-avec-54-g  ]  [ jus-de-citron-concentre-preparee-avec-54 -> fr:jus-de-citron-concentre-preparee-avec-54  ]  [ jus-de-citron-concentre-preparee-avec -> fr:jus-de-citron-concentre-preparee-avec  ]  [ jus-de-citron-concentre-preparee -> 
fr:jus-de-citron-concentre-preparee  ]  [ jus-de-citron-concentre -> fr:jus-de-citron-concentre  ]  [ jus-de-citron -> fr:jus-de-citron  ]  [ jus-de -> fr:jus-de  ]  [ jus -> fr:jus  ]
## 2                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
                                                                                                                                                      
## 3                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       [ pulpe-de-pommes-50 -> fr:pulpe-de-pommes-50  ]  [ pulpe-de-pommes -> fr:pulpe-de-pommes  ]  [ pulpe-de -> fr:pulpe-de  ]  [ pulpe -> fr:pulpe  ]  [ sucre -> fr:sucre  ]  [ sirop-de-glucose -> fr:sirop-de-glucose  ]  [ sirop-de -> fr:sirop-de  ]  [ sirop -> fr:sirop  ]  [ pectine -> en:e440  -> exists  ]  [ acide-citrique -> en:e330  -> exists  ]  [ aromes -> fr:aromes  ]  [ naturels -> fr:naturels  ]  [ extrait-de-paprika-complexes-cuivre-chlorophyllines-curcumine-antnocyanes -> fr:extrait-de-paprika-complexes-cuivre-chlorophyllines-curcumine-antnocyanes  ]  [ extrait-de-paprika-complexes-cuivre-chlorophyllines-curcumine -> fr:extrait-de-paprika-complexes-cuivre-chlorophyllines-curcumine  ]  [ extrait-de-paprika-complexes-cuivre-chlorophyllines -> fr:extrait-de-paprika-complexes-cuivre-chlorophyllines  ]  [ extrait-de-paprika-complexes-cuivre -> fr:extrait-de-paprika-complexes-cuivre  ]  [ 
extrait-de-paprika-complexes -> fr:extrait-de-paprika-complexes  ]  [ extrait-de-paprika -> fr:extrait-de-paprika  ]  [ extrait-de -> fr:extrait-de  ]  [ extrait -> fr:extrait  ]
## 4                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       [ extracto-de-soja -> es:extracto-de-soja  ]  [ 78 -> es:78  ]  [ agua -> es:agua  ]  [ semillas-de-soja-8 -> es:semillas-de-soja-8  ]  [ 3 -> en:fd-c  ]  [ grasas-vegetales -> es:grasas-vegetales  ]  [ jarabe-de-glucosa -> es:jarabe-de-glucosa  ]  [ dextrosa -> es:dextrosa  ]  [ emulsionante -> es:emulsionante  ]  [ mono-y-digliceridos-de-acidos-grasos -> en:e471  -> exists  ]  [ e471 -> en:e471  ]  [ sal-marina -> es:sal-marina  ]  [ estabilizantes -> es:estabilizantes  ]  [ goma-xantana -> en:e415  -> exists  ]  [ e415 -> en:e415  ]  [ carragenatos -> en:e407  -> exists  ]  [ e407 -> en:e407  ]  [ goma-guar -> en:e412  -> exists  ]  [ e412 -> en:e412  ]  [ aromas -> es:aromas  ]  [ antioxidante -> es:antioxidante  ]  [ extractos-de-tocoferoles -> es:extractos-de-tocoferoles  ]  [ de-soja -> es:de-soja  ]  [ e306 -> en:e306  -> exists  ]  [ nota -> es:nota  ]  [ el-envase-en-italiano-del-paquete-que-puede-verse-en-el-enlace -> es:el-envase-en-italiano-del-paquete-que-puede-verse-en-el-enlace  ]  [ especifica-que-el-producto-es-100-vegetal-por-tanto-los-mono-y-digliceridos-de-acidos-grasos -> 
es:especifica-que-el-producto-es-100-vegetal-por-tanto-los-mono-y-digliceridos-de-acidos-grasos  ]  [ e471 -> en:e471  ]  [ son-de-origen-no-animal -> es:son-de-origen-no-animal  ]  [   -> es:   ]
## 5                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
                                                                                              [ pipas-de-girasol-y-sal -> es:pipas-de-girasol-y-sal  ]
## 6                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
                                                                                                                                                      
##                            additives_tags
## 1                                 en:e440
## 2                                        
## 3                         en:e440,en:e330
## 4 en:e471,en:e415,en:e407,en:e412,en:e306
## 5                                        
## 6                                        
##                                                                                                                        additives_en
## 1                                                                                                                    E440 - Pectins
## 2                                                                                                                                  
## 3                                                                                                 E440 - Pectins,E330 - Citric acid
## 4 E471 - Mono- and diglycerides of fatty acids,E415 - Xanthan gum,E407 - Carrageenan,E412 - Guar gum,E306 - Tocopherol-rich extract
## 5                                                                                                                                  
## 6                                                                                                                                  
##   ingredients_from_palm_oil_n ingredients_from_palm_oil
## 1                           0                        NA
## 2                          NA                        NA
## 3                           0                        NA
## 4                           0                        NA
## 5                           0                        NA
## 6                          NA                        NA
##   ingredients_from_palm_oil_tags ingredients_that_may_be_from_palm_oil_n
## 1                                                                      0
## 2                                                                     NA
## 3                                                                      0
## 4                                                                      1
## 5                                                                      0
## 6                                                                     NA
##   ingredients_that_may_be_from_palm_oil
## 1                                    NA
## 2                                    NA
## 3                                    NA
## 4                                    NA
## 5                                    NA
## 6                                    NA
##             ingredients_that_may_be_from_palm_oil_tags nutrition_grade_uk
## 1                                                                      NA
## 2                                                                      NA
## 3                                                                      NA
## 4 e471-mono-et-diglycerides-d-acides-gras-alimentaires                 NA
## 5                                                                      NA
## 6                                                                      NA
##   nutrition_grade_fr         pnns_groups_1      pnns_groups_2
## 1                  d         Sugary snacks             Sweets
## 2                            Sugary snacks Chocolate products
## 3                    Fruits and vegetables             Fruits
## 4                  d               unknown            unknown
## 5                  d               unknown            unknown
## 6                                  unknown            unknown
##                                                                                                                                                                                                                                                                                                                               states
## 1                                                                                                                                   en:to-be-checked, en:complete, en:nutrition-facts-completed, en:ingredients-completed, en:expiration-date-to-be-completed, en:characteristics-completed, en:photos-validated, en:photos-uploaded
## 2                                                                                                                                  en:to-be-completed, en:nutrition-facts-to-be-completed, en:ingredients-to-be-completed, en:expiration-date-to-be-completed, en:characteristics-completed, en:photos-validated, en:photos-uploaded
## 3                                                                                                                                   en:to-be-checked, en:complete, en:nutrition-facts-completed, en:ingredients-completed, en:expiration-date-to-be-completed, en:characteristics-completed, en:photos-validated, en:photos-uploaded
## 4                                                                                                                                         en:to-be-checked, en:complete, en:nutrition-facts-completed, en:ingredients-completed, en:expiration-date-completed, en:characteristics-completed, en:photos-validated, en:photos-uploaded
## 5                                                                                                                                         en:to-be-checked, en:complete, en:nutrition-facts-completed, en:ingredients-completed, en:expiration-date-completed, en:characteristics-completed, en:photos-validated, en:photos-uploaded
## 6 en:to-be-completed, en:nutrition-facts-to-be-completed, en:ingredients-to-be-completed, en:expiration-date-to-be-completed, en:characteristics-to-be-completed, en:categories-to-be-completed, en:brands-to-be-completed, en:packaging-to-be-completed, en:quantity-to-be-completed, en:photos-to-be-validated, en:photos-uploaded
##                                                                                                                                                                                                                                                                                                                states_tags
## 1                                                                                                                                en:to-be-checked,en:complete,en:nutrition-facts-completed,en:ingredients-completed,en:expiration-date-to-be-completed,en:characteristics-completed,en:photos-validated,en:photos-uploaded
## 2                                                                                                                              en:to-be-completed,en:nutrition-facts-to-be-completed,en:ingredients-to-be-completed,en:expiration-date-to-be-completed,en:characteristics-completed,en:photos-validated,en:photos-uploaded
## 3                                                                                                                                en:to-be-checked,en:complete,en:nutrition-facts-completed,en:ingredients-completed,en:expiration-date-to-be-completed,en:characteristics-completed,en:photos-validated,en:photos-uploaded
## 4                                                                                                                                      en:to-be-checked,en:complete,en:nutrition-facts-completed,en:ingredients-completed,en:expiration-date-completed,en:characteristics-completed,en:photos-validated,en:photos-uploaded
## 5                                                                                                                                      en:to-be-checked,en:complete,en:nutrition-facts-completed,en:ingredients-completed,en:expiration-date-completed,en:characteristics-completed,en:photos-validated,en:photos-uploaded
## 6 en:to-be-completed,en:nutrition-facts-to-be-completed,en:ingredients-to-be-completed,en:expiration-date-to-be-completed,en:characteristics-to-be-completed,en:categories-to-be-completed,en:brands-to-be-completed,en:packaging-to-be-completed,en:quantity-to-be-completed,en:photos-to-be-validated,en:photos-uploaded
##                                                                                                                                                                                                                                                                                 states_en
## 1                                                                                                                       To be checked,Complete,Nutrition facts completed,Ingredients completed,Expiration date to be completed,Characteristics completed,Photos validated,Photos uploaded
## 2                                                                                                                  To be completed,Nutrition facts to be completed,Ingredients to be completed,Expiration date to be completed,Characteristics completed,Photos validated,Photos uploaded
## 3                                                                                                                       To be checked,Complete,Nutrition facts completed,Ingredients completed,Expiration date to be completed,Characteristics completed,Photos validated,Photos uploaded
## 4                                                                                                                             To be checked,Complete,Nutrition facts completed,Ingredients completed,Expiration date completed,Characteristics completed,Photos validated,Photos uploaded
## 5                                                                                                                             To be checked,Complete,Nutrition facts completed,Ingredients completed,Expiration date completed,Characteristics completed,Photos validated,Photos uploaded
## 6 To be completed,Nutrition facts to be completed,Ingredients to be completed,Expiration date to be completed,Characteristics to be completed,Categories to be completed,Brands to be completed,Packaging to be completed,Quantity to be completed,Photos to be validated,Photos uploaded
##                        main_category                main_category_en
## 1 en:plant-based-foods-and-beverages Plant-based foods and beverages
## 2                   en:sugary-snacks                   Sugary snacks
## 3 en:plant-based-foods-and-beverages Plant-based foods and beverages
## 4 en:plant-based-foods-and-beverages Plant-based foods and beverages
## 5 en:plant-based-foods-and-beverages Plant-based foods and beverages
## 6                                                                   
##                                                                      image_url
## 1 http://en.openfoodfacts.org/images/products/322/247/574/5867/front.8.400.jpg
## 2 http://en.openfoodfacts.org/images/products/541/097/688/0110/front.7.400.jpg
## 3 http://en.openfoodfacts.org/images/products/326/475/042/3503/front.6.400.jpg
## 4 http://en.openfoodfacts.org/images/products/800/604/024/7001/front.7.400.jpg
## 5 http://en.openfoodfacts.org/images/products/848/000/034/0764/front.6.400.jpg
## 6 http://en.openfoodfacts.org/images/products/008/770/317/7727/front.8.400.jpg
##                                                                image_small_url
## 1 http://en.openfoodfacts.org/images/products/322/247/574/5867/front.8.200.jpg
## 2 http://en.openfoodfacts.org/images/products/541/097/688/0110/front.7.200.jpg
## 3 http://en.openfoodfacts.org/images/products/326/475/042/3503/front.6.200.jpg
## 4 http://en.openfoodfacts.org/images/products/800/604/024/7001/front.7.200.jpg
## 5 http://en.openfoodfacts.org/images/products/848/000/034/0764/front.6.200.jpg
## 6 http://en.openfoodfacts.org/images/products/008/770/317/7727/front.8.200.jpg
##   energy_100g energy_from_fat_100g fat_100g saturated_fat_100g
## 1         918                   NA      0.0                0.0
## 2          NA                   NA       NA                 NA
## 3          NA                   NA       NA                 NA
## 4         766                   NA     16.7                9.9
## 5        2359                   NA     45.5                5.2
## 6          NA                   NA       NA                 NA
##   butyric_acid_100g caproic_acid_100g caprylic_acid_100g capric_acid_100g
## 1                NA                NA                 NA               NA
## 2                NA                NA                 NA               NA
## 3                NA                NA                 NA               NA
## 4                NA                NA                 NA               NA
## 5                NA                NA                 NA               NA
## 6                NA                NA                 NA               NA
##   lauric_acid_100g myristic_acid_100g palmitic_acid_100g stearic_acid_100g
## 1               NA                 NA                 NA                NA
## 2               NA                 NA                 NA                NA
## 3               NA                 NA                 NA                NA
## 4               NA                 NA                 NA                NA
## 5               NA                 NA                 NA                NA
## 6               NA                 NA                 NA                NA
##   arachidic_acid_100g behenic_acid_100g lignoceric_acid_100g
## 1                  NA                NA                   NA
## 2                  NA                NA                   NA
## 3                  NA                NA                   NA
## 4                  NA                NA                   NA
## 5                  NA                NA                   NA
## 6                  NA                NA                   NA
##   cerotic_acid_100g montanic_acid_100g melissic_acid_100g
## 1                NA                 NA                 NA
## 2                NA                 NA                 NA
## 3                NA                 NA                 NA
## 4                NA                 NA                 NA
## 5                NA                 NA                 NA
## 6                NA                 NA                 NA
##   monounsaturated_fat_100g polyunsaturated_fat_100g omega_3_fat_100g
## 1                       NA                       NA               NA
## 2                       NA                       NA               NA
## 3                       NA                       NA               NA
## 4                      2.9                      3.9               NA
## 5                      9.5                     32.8               NA
## 6                       NA                       NA               NA
##   alpha_linolenic_acid_100g eicosapentaenoic_acid_100g
## 1                        NA                         NA
## 2                        NA                         NA
## 3                        NA                         NA
## 4                        NA                         NA
## 5                        NA                         NA
## 6                        NA                         NA
##   docosahexaenoic_acid_100g omega_6_fat_100g linoleic_acid_100g
## 1                        NA               NA                 NA
## 2                        NA               NA                 NA
## 3                        NA               NA                 NA
## 4                        NA               NA                 NA
## 5                        NA               NA                 NA
## 6                        NA               NA                 NA
##   arachidonic_acid_100g gamma_linolenic_acid_100g
## 1                    NA                        NA
## 2                    NA                        NA
## 3                    NA                        NA
## 4                    NA                        NA
## 5                    NA                        NA
## 6                    NA                        NA
##   dihomo_gamma_linolenic_acid_100g omega_9_fat_100g oleic_acid_100g
## 1                               NA               NA              NA
## 2                               NA               NA              NA
## 3                               NA               NA              NA
## 4                               NA               NA              NA
## 5                               NA               NA              NA
## 6                               NA               NA              NA
##   elaidic_acid_100g gondoic_acid_100g mead_acid_100g erucic_acid_100g
## 1                NA                NA             NA               NA
## 2                NA                NA             NA               NA
## 3                NA                NA             NA               NA
## 4                NA                NA             NA               NA
## 5                NA                NA             NA               NA
## 6                NA                NA             NA               NA
##   nervonic_acid_100g trans_fat_100g cholesterol_100g carbohydrates_100g
## 1                 NA             NA               NA               54.0
## 2                 NA             NA               NA                 NA
## 3                 NA             NA               NA                 NA
## 4                 NA             NA            2e-04                5.7
## 5                 NA             NA               NA               17.3
## 6                 NA             NA               NA                 NA
##   sugars_100g sucrose_100g glucose_100g fructose_100g lactose_100g
## 1        54.0           NA           NA            NA           NA
## 2          NA           NA           NA            NA           NA
## 3          NA           NA           NA            NA           NA
## 4         4.2           NA           NA            NA           NA
## 5         2.7           NA           NA            NA           NA
## 6          NA           NA           NA            NA           NA
##   maltose_100g maltodextrins_100g starch_100g polyols_100g fiber_100g
## 1           NA                 NA          NA           NA         NA
## 2           NA                 NA          NA           NA         NA
## 3           NA                 NA          NA           NA         NA
## 4           NA                 NA          NA           NA        0.2
## 5           NA                 NA          NA           NA        9.0
## 6           NA                 NA          NA           NA         NA
##   proteins_100g casein_100g serum_proteins_100g nucleotides_100g salt_100g
## 1           0.0          NA                  NA               NA    0.0000
## 2            NA          NA                  NA               NA        NA
## 3            NA          NA                  NA               NA        NA
## 4           2.9          NA                  NA               NA    0.0508
## 5          18.2          NA                  NA               NA    3.9878
## 6            NA          NA                  NA               NA        NA
##   sodium_100g alcohol_100g vitamin_a_100g beta_carotene_100g
## 1        0.00           NA             NA                 NA
## 2          NA           NA             NA                 NA
## 3          NA           NA             NA                 NA
## 4        0.02           NA             NA                 NA
## 5        1.57           NA             NA                 NA
## 6          NA           NA             NA                 NA
##   vitamin_d_100g vitamin_e_100g vitamin_k_100g vitamin_c_100g
## 1             NA             NA             NA             NA
## 2             NA             NA             NA             NA
## 3             NA             NA             NA             NA
## 4             NA             NA             NA             NA
## 5             NA             NA             NA             NA
## 6             NA             NA             NA             NA
##   vitamin_b1_100g vitamin_b2_100g vitamin_pp_100g vitamin_b6_100g
## 1              NA              NA              NA              NA
## 2              NA              NA              NA              NA
## 3              NA              NA              NA              NA
## 4              NA              NA              NA              NA
## 5              NA              NA              NA              NA
## 6              NA              NA              NA              NA
##   vitamin_b9_100g vitamin_b12_100g biotin_100g pantothenic_acid_100g
## 1              NA               NA          NA                    NA
## 2              NA               NA          NA                    NA
## 3              NA               NA          NA                    NA
## 4              NA               NA          NA                    NA
## 5              NA               NA          NA                    NA
## 6              NA               NA          NA                    NA
##   silica_100g bicarbonate_100g potassium_100g chloride_100g calcium_100g
## 1          NA               NA             NA            NA           NA
## 2          NA               NA             NA            NA           NA
## 3          NA               NA             NA            NA           NA
## 4          NA               NA             NA            NA           NA
## 5          NA               NA             NA            NA           NA
## 6          NA               NA             NA            NA           NA
##   phosphorus_100g iron_100g magnesium_100g zinc_100g copper_100g
## 1              NA        NA             NA        NA          NA
## 2              NA        NA             NA        NA          NA
## 3              NA        NA             NA        NA          NA
## 4              NA        NA             NA        NA          NA
## 5           1.155    0.0038          0.129        NA          NA
## 6              NA        NA             NA        NA          NA
##   manganese_100g fluoride_100g selenium_100g chromium_100g molybdenum_100g
## 1             NA            NA            NA            NA              NA
## 2             NA            NA            NA            NA              NA
## 3             NA            NA            NA            NA              NA
## 4             NA            NA            NA            NA              NA
## 5             NA            NA            NA            NA              NA
## 6             NA            NA            NA            NA              NA
##   iodine_100g caffeine_100g taurine_100g ph_100g
## 1          NA            NA           NA      NA
## 2          NA            NA           NA      NA
## 3          NA            NA           NA      NA
## 4          NA            NA           NA      NA
## 5          NA            NA           NA      NA
## 6          NA            NA           NA      NA
##   fruits_vegetables_nuts_100g collagen_meat_protein_ratio_100g cocoa_100g
## 1                          54                               NA         NA
## 2                          NA                               NA         NA
## 3                          NA                               NA         NA
## 4                          NA                               NA         NA
## 5                          NA                               NA         NA
## 6                          NA                               NA         NA
##   chlorophyl_100g carbon_footprint_100g nutrition_score_fr_100g
## 1              NA                    NA                      11
## 2              NA                    NA                      NA
## 3              NA                    NA                      NA
## 4              NA                    NA                      11
## 5              NA                    NA                      17
## 6              NA                    NA                      NA
##   nutrition_score_uk_100g
## 1                      11
## 2                      NA
## 3                      NA
## 4                      11
## 5                      17
## 6                      NA
# View the structure of food: one line per column with its type and first values
str(food)
## 'data.frame':    1500 obs. of  160 variables:
##  $ V1                                        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ code                                      : int  100030 100050 100079 100094 100124 100136 100194 100221 100257 100258 ...
##  $ url                                       : chr  "http://world-en.openfoodfacts.org/product/3222475745867/confiture-de-fraise-fraise-des-bois-au-sucre-de-canne-casino-delices" "http://world-en.openfoodfacts.org/product/5410976880110/guylian-sea-shells-selection" "http://world-en.openfoodfacts.org/product/3264750423503/pates-de-fruits-aromatisees-jacquot" "http://world-en.openfoodfacts.org/product/8006040247001/nata-vegetal-a-base-de-soja-valsoia" ...
##  $ creator                                   : chr  "sebleouf" "foodorigins" "domdom26" "javichu" ...
##  $ created_t                                 : int  1424747544 1450316429 1428674916 1420416591 1420501121 1437983923 1442420988 1435686217 1436991777 1400516512 ...
##  $ created_datetime                          : chr  "2015-02-24T03:12:24Z" "2015-12-17T01:40:29Z" "2015-04-10T14:08:36Z" "2015-01-05T00:09:51Z" ...
##  $ last_modified_t                           : int  1438445887 1450817956 1428739289 1420417876 1445700917 1445577476 1442420988 1451405288 1436991779 1437236856 ...
##  $ last_modified_datetime                    : chr  "2015-08-01T16:18:07Z" "2015-12-22T20:59:16Z" "2015-04-11T08:01:29Z" "2015-01-05T00:31:16Z" ...
##  $ product_name                              : chr  "Confiture de fraise fraise des bois au sucre de canne" "Guylian Sea Shells Selection" "Pâtes de fruits aromatisées" "Nata vegetal a base de soja &quot;Valsoia&quot;" ...
##  $ generic_name                              : chr  "" "" "Pâtes de fruits" "Nata vegetal a base de soja" ...
##  $ quantity                                  : chr  "265 g" "375g" "1 kg" "200 ml" ...
##  $ packaging                                 : chr  "Bocal,Verre" "Plastic,Box" "Carton,plastique" "Tetra Brik" ...
##  $ packaging_tags                            : chr  "bocal,verre" "plastic,box" "carton,plastique" "tetra-brik" ...
##  $ brands                                    : chr  "Casino Délices" "Guylian" "Jacquot" "Valsoia,//Propiedad de://,Valsoia S.p.A." ...
##  $ brands_tags                               : chr  "casino-delices" "guylian" "jacquot" "valsoia,propiedad-de,valsoia-s-p-a" ...
##  $ categories                                : chr  "Aliments et boissons à base de végétaux,Aliments d'origine végétale,Aliments à base de fruits et de légu"| __truncated__ "Chocolate" "pâtes de fruits" "Alimentos y bebidas de origen vegetal,Alimentos de origen vegetal,Natas vegetales,Natas vegetales a base de soj"| __truncated__ ...
##  $ categories_tags                           : chr  "en:plant-based-foods-and-beverages,en:plant-based-foods,en:fruits-and-vegetables-based-foods,en:breakfasts,en:s"| __truncated__ "en:sugary-snacks,en:chocolates" "en:plant-based-foods-and-beverages,en:plant-based-foods,en:fruits-and-vegetables-based-foods,en:sugary-snacks,e"| __truncated__ "en:plant-based-foods-and-beverages,en:plant-based-foods,en:plant-based-creams,en:plant-based-creams-for-cooking"| __truncated__ ...
##  $ categories_en                             : chr  "Plant-based foods and beverages,Plant-based foods,Fruits and vegetables based foods,Breakfasts,Spreads,Fruits b"| __truncated__ "Sugary snacks,Chocolates" "Plant-based foods and beverages,Plant-based foods,Fruits and vegetables based foods,Sugary snacks,Confectioneri"| __truncated__ "Plant-based foods and beverages,Plant-based foods,Plant-based creams,Plant-based creams for cooking,Soy-based c"| __truncated__ ...
##  $ origins                                   : chr  "" "" "" "" ...
##  $ origins_tags                              : chr  "" "" "" "" ...
##  $ manufacturing_places                      : chr  "France" "Belgium" "" "Italia" ...
##  $ manufacturing_places_tags                 : chr  "france" "belgium" "" "italia" ...
##  $ labels                                    : chr  "" "" "" "Vegetariano,Vegano,Sin gluten,Sin OMG,Sin lactosa" ...
##  $ labels_tags                               : chr  "" "" "" "en:vegetarian,en:vegan,en:gluten-free,en:no-gmos,en:no-lactose" ...
##  $ labels_en                                 : chr  "" "" "" "Vegetarian,Vegan,Gluten-free,No GMOs,No lactose" ...
##  $ emb_codes                                 : chr  "EMB 78015" "" "" "" ...
##  $ emb_codes_tags                            : chr  "emb-78015" "" "" "" ...
##  $ first_packaging_code_geo                  : chr  "48.983333,2.066667" "" "" "" ...
##  $ cities                                    : logi  NA NA NA NA NA NA ...
##  $ cities_tags                               : chr  "andresy-yvelines-france" "" "" "" ...
##  $ purchase_places                           : chr  "Lyon,France" "NSW,Australia" "France" "Madrid,España" ...
##  $ stores                                    : chr  "Casino" "" "" "El Corte Inglés" ...
##  $ countries                                 : chr  "France" "Australia" "France" "España" ...
##  $ countries_tags                            : chr  "en:france" "en:australia" "en:france" "en:spain" ...
##  $ countries_en                              : chr  "France" "Australia" "France" "Spain" ...
##  $ ingredients_text                          : chr  "Sucre de canne, fraises 40 g, fraises des bois 14 g, gélifiant : pectines de fruits, jus de citron concentré."| __truncated__ "" "Pulpe de pommes 50% , sucre, sirop de glucose, gélifiant : pectine, acidifiant : acide citrique, arômes, colo"| __truncated__ "Extracto de soja (78%) (agua, semillas de soja 8,3%), grasas vegetales, jarabe de glucosa, dextrosa, emulsionan"| __truncated__ ...
##  $ allergens                                 : chr  "" "" "" "" ...
##  $ allergens_en                              : logi  NA NA NA NA NA NA ...
##  $ traces                                    : chr  "Lait,Fruits à coque" "" "" "" ...
##  $ traces_tags                               : chr  "en:milk,en:nuts" "" "" "" ...
##  $ traces_en                                 : chr  "Milk,Nuts" "" "" "" ...
##  $ serving_size                              : chr  "15 g" "" "" "" ...
##  $ no_nutriments                             : logi  NA NA NA NA NA NA ...
##  $ additives_n                               : int  1 NA 2 5 0 NA NA 0 NA 1 ...
##  $ additives                                 : chr  "[ sucre-de-canne -> fr:sucre-de-canne  ]  [ sucre-de -> fr:sucre-de  ]  [ sucre -> fr:sucre  ]  [ fraises-40-g "| __truncated__ "" "[ pulpe-de-pommes-50 -> fr:pulpe-de-pommes-50  ]  [ pulpe-de-pommes -> fr:pulpe-de-pommes  ]  [ pulpe-de -> fr:"| __truncated__ "[ extracto-de-soja -> es:extracto-de-soja  ]  [ 78 -> es:78  ]  [ agua -> es:agua  ]  [ semillas-de-soja-8 -> e"| __truncated__ ...
##  $ additives_tags                            : chr  "en:e440" "" "en:e440,en:e330" "en:e471,en:e415,en:e407,en:e412,en:e306" ...
##  $ additives_en                              : chr  "E440 - Pectins" "" "E440 - Pectins,E330 - Citric acid" "E471 - Mono- and diglycerides of fatty acids,E415 - Xanthan gum,E407 - Carrageenan,E412 - Guar gum,E306 - Tocop"| __truncated__ ...
##  $ ingredients_from_palm_oil_n               : int  0 NA 0 0 0 NA NA 0 NA 0 ...
##  $ ingredients_from_palm_oil                 : logi  NA NA NA NA NA NA ...
##  $ ingredients_from_palm_oil_tags            : chr  "" "" "" "" ...
##  $ ingredients_that_may_be_from_palm_oil_n   : int  0 NA 0 1 0 NA NA 0 NA 0 ...
##  $ ingredients_that_may_be_from_palm_oil     : logi  NA NA NA NA NA NA ...
##  $ ingredients_that_may_be_from_palm_oil_tags: chr  "" "" "" "e471-mono-et-diglycerides-d-acides-gras-alimentaires" ...
##  $ nutrition_grade_uk                        : logi  NA NA NA NA NA NA ...
##  $ nutrition_grade_fr                        : chr  "d" "" "" "d" ...
##  $ pnns_groups_1                             : chr  "Sugary snacks" "Sugary snacks" "Fruits and vegetables" "unknown" ...
##  $ pnns_groups_2                             : chr  "Sweets" "Chocolate products" "Fruits" "unknown" ...
##  $ states                                    : chr  "en:to-be-checked, en:complete, en:nutrition-facts-completed, en:ingredients-completed, en:expiration-date-to-be"| __truncated__ "en:to-be-completed, en:nutrition-facts-to-be-completed, en:ingredients-to-be-completed, en:expiration-date-to-b"| __truncated__ "en:to-be-checked, en:complete, en:nutrition-facts-completed, en:ingredients-completed, en:expiration-date-to-be"| __truncated__ "en:to-be-checked, en:complete, en:nutrition-facts-completed, en:ingredients-completed, en:expiration-date-compl"| __truncated__ ...
##  $ states_tags                               : chr  "en:to-be-checked,en:complete,en:nutrition-facts-completed,en:ingredients-completed,en:expiration-date-to-be-com"| __truncated__ "en:to-be-completed,en:nutrition-facts-to-be-completed,en:ingredients-to-be-completed,en:expiration-date-to-be-c"| __truncated__ "en:to-be-checked,en:complete,en:nutrition-facts-completed,en:ingredients-completed,en:expiration-date-to-be-com"| __truncated__ "en:to-be-checked,en:complete,en:nutrition-facts-completed,en:ingredients-completed,en:expiration-date-completed"| __truncated__ ...
##  $ states_en                                 : chr  "To be checked,Complete,Nutrition facts completed,Ingredients completed,Expiration date to be completed,Characte"| __truncated__ "To be completed,Nutrition facts to be completed,Ingredients to be completed,Expiration date to be completed,Cha"| __truncated__ "To be checked,Complete,Nutrition facts completed,Ingredients completed,Expiration date to be completed,Characte"| __truncated__ "To be checked,Complete,Nutrition facts completed,Ingredients completed,Expiration date completed,Characteristic"| __truncated__ ...
##  $ main_category                             : chr  "en:plant-based-foods-and-beverages" "en:sugary-snacks" "en:plant-based-foods-and-beverages" "en:plant-based-foods-and-beverages" ...
##  $ main_category_en                          : chr  "Plant-based foods and beverages" "Sugary snacks" "Plant-based foods and beverages" "Plant-based foods and beverages" ...
##  $ image_url                                 : chr  "http://en.openfoodfacts.org/images/products/322/247/574/5867/front.8.400.jpg" "http://en.openfoodfacts.org/images/products/541/097/688/0110/front.7.400.jpg" "http://en.openfoodfacts.org/images/products/326/475/042/3503/front.6.400.jpg" "http://en.openfoodfacts.org/images/products/800/604/024/7001/front.7.400.jpg" ...
##  $ image_small_url                           : chr  "http://en.openfoodfacts.org/images/products/322/247/574/5867/front.8.200.jpg" "http://en.openfoodfacts.org/images/products/541/097/688/0110/front.7.200.jpg" "http://en.openfoodfacts.org/images/products/326/475/042/3503/front.6.200.jpg" "http://en.openfoodfacts.org/images/products/800/604/024/7001/front.7.200.jpg" ...
##  $ energy_100g                               : num  918 NA NA 766 2359 ...
##  $ energy_from_fat_100g                      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ fat_100g                                  : num  0 NA NA 16.7 45.5 NA NA 25 NA 4 ...
##  $ saturated_fat_100g                        : num  0 NA NA 9.9 5.2 NA NA 17 NA 0.54 ...
##  $ butyric_acid_100g                         : logi  NA NA NA NA NA NA ...
##  $ caproic_acid_100g                         : logi  NA NA NA NA NA NA ...
##  $ caprylic_acid_100g                        : logi  NA NA NA NA NA NA ...
##  $ capric_acid_100g                          : logi  NA NA NA NA NA NA ...
##  $ lauric_acid_100g                          : logi  NA NA NA NA NA NA ...
##  $ myristic_acid_100g                        : logi  NA NA NA NA NA NA ...
##  $ palmitic_acid_100g                        : logi  NA NA NA NA NA NA ...
##  $ stearic_acid_100g                         : logi  NA NA NA NA NA NA ...
##  $ arachidic_acid_100g                       : logi  NA NA NA NA NA NA ...
##  $ behenic_acid_100g                         : logi  NA NA NA NA NA NA ...
##  $ lignoceric_acid_100g                      : logi  NA NA NA NA NA NA ...
##  $ cerotic_acid_100g                         : logi  NA NA NA NA NA NA ...
##  $ montanic_acid_100g                        : logi  NA NA NA NA NA NA ...
##  $ melissic_acid_100g                        : logi  NA NA NA NA NA NA ...
##  $ monounsaturated_fat_100g                  : num  NA NA NA 2.9 9.5 NA NA NA NA NA ...
##  $ polyunsaturated_fat_100g                  : num  NA NA NA 3.9 32.8 NA NA NA NA NA ...
##  $ omega_3_fat_100g                          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ alpha_linolenic_acid_100g                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ eicosapentaenoic_acid_100g                : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ docosahexaenoic_acid_100g                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ omega_6_fat_100g                          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ linoleic_acid_100g                        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ arachidonic_acid_100g                     : logi  NA NA NA NA NA NA ...
##  $ gamma_linolenic_acid_100g                 : logi  NA NA NA NA NA NA ...
##  $ dihomo_gamma_linolenic_acid_100g          : logi  NA NA NA NA NA NA ...
##  $ omega_9_fat_100g                          : logi  NA NA NA NA NA NA ...
##  $ oleic_acid_100g                           : logi  NA NA NA NA NA NA ...
##  $ elaidic_acid_100g                         : logi  NA NA NA NA NA NA ...
##  $ gondoic_acid_100g                         : logi  NA NA NA NA NA NA ...
##  $ mead_acid_100g                            : logi  NA NA NA NA NA NA ...
##  $ erucic_acid_100g                          : logi  NA NA NA NA NA NA ...
##   [list output truncated]

Ex3: Inspecting variables

# glimpse() comes from dplyr; calling it through the namespace means this line
# does not depend on library(dplyr) having been run first.
# It prints one line per column (name, type, first few values), which is
# easier to scan than str() for a table this wide.
dplyr::glimpse(food)
## Observations: 1,500
## Variables: 160
## $ V1                                         <int> 1, 2, 3, 4, 5, 6, 7...
## $ code                                       <int> 100030, 100050, 100...
## $ url                                        <chr> "http://world-en.op...
## $ creator                                    <chr> "sebleouf", "foodor...
## $ created_t                                  <int> 1424747544, 1450316...
## $ created_datetime                           <chr> "2015-02-24T03:12:2...
## $ last_modified_t                            <int> 1438445887, 1450817...
## $ last_modified_datetime                     <chr> "2015-08-01T16:18:0...
## $ product_name                               <chr> "Confiture de frais...
## $ generic_name                               <chr> "", "", "Pâtes de ...
## $ quantity                                   <chr> "265 g", "375g", "1...
## $ packaging                                  <chr> "Bocal,Verre", "Pla...
## $ packaging_tags                             <chr> "bocal,verre", "pla...
## $ brands                                     <chr> "Casino Délices", ...
## $ brands_tags                                <chr> "casino-delices", "...
## $ categories                                 <chr> "Aliments et boisso...
## $ categories_tags                            <chr> "en:plant-based-foo...
## $ categories_en                              <chr> "Plant-based foods ...
## $ origins                                    <chr> "", "", "", "", "Ar...
## $ origins_tags                               <chr> "", "", "", "", "ar...
## $ manufacturing_places                       <chr> "France", "Belgium"...
## $ manufacturing_places_tags                  <chr> "france", "belgium"...
## $ labels                                     <chr> "", "", "", "Vegeta...
## $ labels_tags                                <chr> "", "", "", "en:veg...
## $ labels_en                                  <chr> "", "", "", "Vegeta...
## $ emb_codes                                  <chr> "EMB 78015", "", ""...
## $ emb_codes_tags                             <chr> "emb-78015", "", ""...
## $ first_packaging_code_geo                   <chr> "48.983333,2.066667...
## $ cities                                     <lgl> NA, NA, NA, NA, NA,...
## $ cities_tags                                <chr> "andresy-yvelines-f...
## $ purchase_places                            <chr> "Lyon,France", "NSW...
## $ stores                                     <chr> "Casino", "", "", "...
## $ countries                                  <chr> "France", "Australi...
## $ countries_tags                             <chr> "en:france", "en:au...
## $ countries_en                               <chr> "France", "Australi...
## $ ingredients_text                           <chr> "Sucre de canne, fr...
## $ allergens                                  <chr> "", "", "", "", "",...
## $ allergens_en                               <lgl> NA, NA, NA, NA, NA,...
## $ traces                                     <chr> "Lait,Fruits à coq...
## $ traces_tags                                <chr> "en:milk,en:nuts", ...
## $ traces_en                                  <chr> "Milk,Nuts", "", ""...
## $ serving_size                               <chr> "15 g", "", "", "",...
## $ no_nutriments                              <lgl> NA, NA, NA, NA, NA,...
## $ additives_n                                <int> 1, NA, 2, 5, 0, NA,...
## $ additives                                  <chr> "[ sucre-de-canne -...
## $ additives_tags                             <chr> "en:e440", "", "en:...
## $ additives_en                               <chr> "E440 - Pectins", "...
## $ ingredients_from_palm_oil_n                <int> 0, NA, 0, 0, 0, NA,...
## $ ingredients_from_palm_oil                  <lgl> NA, NA, NA, NA, NA,...
## $ ingredients_from_palm_oil_tags             <chr> "", "", "", "", "",...
## $ ingredients_that_may_be_from_palm_oil_n    <int> 0, NA, 0, 1, 0, NA,...
## $ ingredients_that_may_be_from_palm_oil      <lgl> NA, NA, NA, NA, NA,...
## $ ingredients_that_may_be_from_palm_oil_tags <chr> "", "", "", "e471-m...
## $ nutrition_grade_uk                         <lgl> NA, NA, NA, NA, NA,...
## $ nutrition_grade_fr                         <chr> "d", "", "", "d", "...
## $ pnns_groups_1                              <chr> "Sugary snacks", "S...
## $ pnns_groups_2                              <chr> "Sweets", "Chocolat...
## $ states                                     <chr> "en:to-be-checked, ...
## $ states_tags                                <chr> "en:to-be-checked,e...
## $ states_en                                  <chr> "To be checked,Comp...
## $ main_category                              <chr> "en:plant-based-foo...
## $ main_category_en                           <chr> "Plant-based foods ...
## $ image_url                                  <chr> "http://en.openfood...
## $ image_small_url                            <chr> "http://en.openfood...
## $ energy_100g                                <dbl> 918, NA, NA, 766, 2...
## $ energy_from_fat_100g                       <dbl> NA, NA, NA, NA, NA,...
## $ fat_100g                                   <dbl> 0.00, NA, NA, 16.70...
## $ saturated_fat_100g                         <dbl> 0.000, NA, NA, 9.90...
## $ butyric_acid_100g                          <lgl> NA, NA, NA, NA, NA,...
## $ caproic_acid_100g                          <lgl> NA, NA, NA, NA, NA,...
## $ caprylic_acid_100g                         <lgl> NA, NA, NA, NA, NA,...
## $ capric_acid_100g                           <lgl> NA, NA, NA, NA, NA,...
## $ lauric_acid_100g                           <lgl> NA, NA, NA, NA, NA,...
## $ myristic_acid_100g                         <lgl> NA, NA, NA, NA, NA,...
## $ palmitic_acid_100g                         <lgl> NA, NA, NA, NA, NA,...
## $ stearic_acid_100g                          <lgl> NA, NA, NA, NA, NA,...
## $ arachidic_acid_100g                        <lgl> NA, NA, NA, NA, NA,...
## $ behenic_acid_100g                          <lgl> NA, NA, NA, NA, NA,...
## $ lignoceric_acid_100g                       <lgl> NA, NA, NA, NA, NA,...
## $ cerotic_acid_100g                          <lgl> NA, NA, NA, NA, NA,...
## $ montanic_acid_100g                         <lgl> NA, NA, NA, NA, NA,...
## $ melissic_acid_100g                         <lgl> NA, NA, NA, NA, NA,...
## $ monounsaturated_fat_100g                   <dbl> NA, NA, NA, 2.9, 9....
## $ polyunsaturated_fat_100g                   <dbl> NA, NA, NA, 3.9, 32...
## $ omega_3_fat_100g                           <dbl> NA, NA, NA, NA, NA,...
## $ alpha_linolenic_acid_100g                  <dbl> NA, NA, NA, NA, NA,...
## $ eicosapentaenoic_acid_100g                 <dbl> NA, NA, NA, NA, NA,...
## $ docosahexaenoic_acid_100g                  <dbl> NA, NA, NA, NA, NA,...
## $ omega_6_fat_100g                           <dbl> NA, NA, NA, NA, NA,...
## $ linoleic_acid_100g                         <dbl> NA, NA, NA, NA, NA,...
## $ arachidonic_acid_100g                      <lgl> NA, NA, NA, NA, NA,...
## $ gamma_linolenic_acid_100g                  <lgl> NA, NA, NA, NA, NA,...
## $ dihomo_gamma_linolenic_acid_100g           <lgl> NA, NA, NA, NA, NA,...
## $ omega_9_fat_100g                           <lgl> NA, NA, NA, NA, NA,...
## $ oleic_acid_100g                            <lgl> NA, NA, NA, NA, NA,...
## $ elaidic_acid_100g                          <lgl> NA, NA, NA, NA, NA,...
## $ gondoic_acid_100g                          <lgl> NA, NA, NA, NA, NA,...
## $ mead_acid_100g                             <lgl> NA, NA, NA, NA, NA,...
## $ erucic_acid_100g                           <lgl> NA, NA, NA, NA, NA,...
## $ nervonic_acid_100g                         <lgl> NA, NA, NA, NA, NA,...
## $ trans_fat_100g                             <dbl> NA, NA, NA, NA, NA,...
## $ cholesterol_100g                           <dbl> NA, NA, NA, 0.00020...
## $ carbohydrates_100g                         <dbl> 54.00, NA, NA, 5.70...
## $ sugars_100g                                <dbl> 54.00, NA, NA, 4.20...
## $ sucrose_100g                               <lgl> NA, NA, NA, NA, NA,...
## $ glucose_100g                               <lgl> NA, NA, NA, NA, NA,...
## $ fructose_100g                              <int> NA, NA, NA, NA, NA,...
## $ lactose_100g                               <dbl> NA, NA, NA, NA, NA,...
## $ maltose_100g                               <lgl> NA, NA, NA, NA, NA,...
## $ maltodextrins_100g                         <lgl> NA, NA, NA, NA, NA,...
## $ starch_100g                                <dbl> NA, NA, NA, NA, NA,...
## $ polyols_100g                               <dbl> NA, NA, NA, NA, NA,...
## $ fiber_100g                                 <dbl> NA, NA, NA, 0.2, 9....
## $ proteins_100g                              <dbl> 0.00, NA, NA, 2.90,...
## $ casein_100g                                <dbl> NA, NA, NA, NA, NA,...
## $ serum_proteins_100g                        <lgl> NA, NA, NA, NA, NA,...
## $ nucleotides_100g                           <lgl> NA, NA, NA, NA, NA,...
## $ salt_100g                                  <dbl> 0.0000000, NA, NA, ...
## $ sodium_100g                                <dbl> 0.0000000, NA, NA, ...
## $ alcohol_100g                               <dbl> NA, NA, NA, NA, NA,...
## $ vitamin_a_100g                             <dbl> NA, NA, NA, NA, NA,...
## $ beta_carotene_100g                         <lgl> NA, NA, NA, NA, NA,...
## $ vitamin_d_100g                             <dbl> NA, NA, NA, NA, NA,...
## $ vitamin_e_100g                             <dbl> NA, NA, NA, NA, NA,...
## $ vitamin_k_100g                             <dbl> NA, NA, NA, NA, NA,...
## $ vitamin_c_100g                             <dbl> NA, NA, NA, NA, NA,...
## $ vitamin_b1_100g                            <dbl> NA, NA, NA, NA, NA,...
## $ vitamin_b2_100g                            <dbl> NA, NA, NA, NA, NA,...
## $ vitamin_pp_100g                            <dbl> NA, NA, NA, NA, NA,...
## $ vitamin_b6_100g                            <dbl> NA, NA, NA, NA, NA,...
## $ vitamin_b9_100g                            <dbl> NA, NA, NA, NA, NA,...
## $ vitamin_b12_100g                           <dbl> NA, NA, NA, NA, NA,...
## $ biotin_100g                                <dbl> NA, NA, NA, NA, NA,...
## $ pantothenic_acid_100g                      <dbl> NA, NA, NA, NA, NA,...
## $ silica_100g                                <dbl> NA, NA, NA, NA, NA,...
## $ bicarbonate_100g                           <dbl> NA, NA, NA, NA, NA,...
## $ potassium_100g                             <dbl> NA, NA, NA, NA, NA,...
## $ chloride_100g                              <dbl> NA, NA, NA, NA, NA,...
## $ calcium_100g                               <dbl> NA, NA, NA, NA, NA,...
## $ phosphorus_100g                            <dbl> NA, NA, NA, NA, 1.1...
## $ iron_100g                                  <dbl> NA, NA, NA, NA, 0.0...
## $ magnesium_100g                             <dbl> NA, NA, NA, NA, 0.1...
## $ zinc_100g                                  <dbl> NA, NA, NA, NA, NA,...
## $ copper_100g                                <dbl> NA, NA, NA, NA, NA,...
## $ manganese_100g                             <dbl> NA, NA, NA, NA, NA,...
## $ fluoride_100g                              <dbl> NA, NA, NA, NA, NA,...
## $ selenium_100g                              <dbl> NA, NA, NA, NA, NA,...
## $ chromium_100g                              <lgl> NA, NA, NA, NA, NA,...
## $ molybdenum_100g                            <lgl> NA, NA, NA, NA, NA,...
## $ iodine_100g                                <dbl> NA, NA, NA, NA, NA,...
## $ caffeine_100g                              <lgl> NA, NA, NA, NA, NA,...
## $ taurine_100g                               <lgl> NA, NA, NA, NA, NA,...
## $ ph_100g                                    <lgl> NA, NA, NA, NA, NA,...
## $ fruits_vegetables_nuts_100g                <dbl> 54, NA, NA, NA, NA,...
## $ collagen_meat_protein_ratio_100g           <int> NA, NA, NA, NA, NA,...
## $ cocoa_100g                                 <int> NA, NA, NA, NA, NA,...
## $ chlorophyl_100g                            <lgl> NA, NA, NA, NA, NA,...
## $ carbon_footprint_100g                      <dbl> NA, NA, NA, NA, NA,...
## $ nutrition_score_fr_100g                    <int> 11, NA, NA, 11, 17,...
## $ nutrition_score_uk_100g                    <int> 11, NA, NA, 11, 17,...
# List every column name -- scan for pairs that look like duplicated information
colnames(food)
##   [1] "V1"                                        
##   [2] "code"                                      
##   [3] "url"                                       
##   [4] "creator"                                   
##   [5] "created_t"                                 
##   [6] "created_datetime"                          
##   [7] "last_modified_t"                           
##   [8] "last_modified_datetime"                    
##   [9] "product_name"                              
##  [10] "generic_name"                              
##  [11] "quantity"                                  
##  [12] "packaging"                                 
##  [13] "packaging_tags"                            
##  [14] "brands"                                    
##  [15] "brands_tags"                               
##  [16] "categories"                                
##  [17] "categories_tags"                           
##  [18] "categories_en"                             
##  [19] "origins"                                   
##  [20] "origins_tags"                              
##  [21] "manufacturing_places"                      
##  [22] "manufacturing_places_tags"                 
##  [23] "labels"                                    
##  [24] "labels_tags"                               
##  [25] "labels_en"                                 
##  [26] "emb_codes"                                 
##  [27] "emb_codes_tags"                            
##  [28] "first_packaging_code_geo"                  
##  [29] "cities"                                    
##  [30] "cities_tags"                               
##  [31] "purchase_places"                           
##  [32] "stores"                                    
##  [33] "countries"                                 
##  [34] "countries_tags"                            
##  [35] "countries_en"                              
##  [36] "ingredients_text"                          
##  [37] "allergens"                                 
##  [38] "allergens_en"                              
##  [39] "traces"                                    
##  [40] "traces_tags"                               
##  [41] "traces_en"                                 
##  [42] "serving_size"                              
##  [43] "no_nutriments"                             
##  [44] "additives_n"                               
##  [45] "additives"                                 
##  [46] "additives_tags"                            
##  [47] "additives_en"                              
##  [48] "ingredients_from_palm_oil_n"               
##  [49] "ingredients_from_palm_oil"                 
##  [50] "ingredients_from_palm_oil_tags"            
##  [51] "ingredients_that_may_be_from_palm_oil_n"   
##  [52] "ingredients_that_may_be_from_palm_oil"     
##  [53] "ingredients_that_may_be_from_palm_oil_tags"
##  [54] "nutrition_grade_uk"                        
##  [55] "nutrition_grade_fr"                        
##  [56] "pnns_groups_1"                             
##  [57] "pnns_groups_2"                             
##  [58] "states"                                    
##  [59] "states_tags"                               
##  [60] "states_en"                                 
##  [61] "main_category"                             
##  [62] "main_category_en"                          
##  [63] "image_url"                                 
##  [64] "image_small_url"                           
##  [65] "energy_100g"                               
##  [66] "energy_from_fat_100g"                      
##  [67] "fat_100g"                                  
##  [68] "saturated_fat_100g"                        
##  [69] "butyric_acid_100g"                         
##  [70] "caproic_acid_100g"                         
##  [71] "caprylic_acid_100g"                        
##  [72] "capric_acid_100g"                          
##  [73] "lauric_acid_100g"                          
##  [74] "myristic_acid_100g"                        
##  [75] "palmitic_acid_100g"                        
##  [76] "stearic_acid_100g"                         
##  [77] "arachidic_acid_100g"                       
##  [78] "behenic_acid_100g"                         
##  [79] "lignoceric_acid_100g"                      
##  [80] "cerotic_acid_100g"                         
##  [81] "montanic_acid_100g"                        
##  [82] "melissic_acid_100g"                        
##  [83] "monounsaturated_fat_100g"                  
##  [84] "polyunsaturated_fat_100g"                  
##  [85] "omega_3_fat_100g"                          
##  [86] "alpha_linolenic_acid_100g"                 
##  [87] "eicosapentaenoic_acid_100g"                
##  [88] "docosahexaenoic_acid_100g"                 
##  [89] "omega_6_fat_100g"                          
##  [90] "linoleic_acid_100g"                        
##  [91] "arachidonic_acid_100g"                     
##  [92] "gamma_linolenic_acid_100g"                 
##  [93] "dihomo_gamma_linolenic_acid_100g"          
##  [94] "omega_9_fat_100g"                          
##  [95] "oleic_acid_100g"                           
##  [96] "elaidic_acid_100g"                         
##  [97] "gondoic_acid_100g"                         
##  [98] "mead_acid_100g"                            
##  [99] "erucic_acid_100g"                          
## [100] "nervonic_acid_100g"                        
## [101] "trans_fat_100g"                            
## [102] "cholesterol_100g"                          
## [103] "carbohydrates_100g"                        
## [104] "sugars_100g"                               
## [105] "sucrose_100g"                              
## [106] "glucose_100g"                              
## [107] "fructose_100g"                             
## [108] "lactose_100g"                              
## [109] "maltose_100g"                              
## [110] "maltodextrins_100g"                        
## [111] "starch_100g"                               
## [112] "polyols_100g"                              
## [113] "fiber_100g"                                
## [114] "proteins_100g"                             
## [115] "casein_100g"                               
## [116] "serum_proteins_100g"                       
## [117] "nucleotides_100g"                          
## [118] "salt_100g"                                 
## [119] "sodium_100g"                               
## [120] "alcohol_100g"                              
## [121] "vitamin_a_100g"                            
## [122] "beta_carotene_100g"                        
## [123] "vitamin_d_100g"                            
## [124] "vitamin_e_100g"                            
## [125] "vitamin_k_100g"                            
## [126] "vitamin_c_100g"                            
## [127] "vitamin_b1_100g"                           
## [128] "vitamin_b2_100g"                           
## [129] "vitamin_pp_100g"                           
## [130] "vitamin_b6_100g"                           
## [131] "vitamin_b9_100g"                           
## [132] "vitamin_b12_100g"                          
## [133] "biotin_100g"                               
## [134] "pantothenic_acid_100g"                     
## [135] "silica_100g"                               
## [136] "bicarbonate_100g"                          
## [137] "potassium_100g"                            
## [138] "chloride_100g"                             
## [139] "calcium_100g"                              
## [140] "phosphorus_100g"                           
## [141] "iron_100g"                                 
## [142] "magnesium_100g"                            
## [143] "zinc_100g"                                 
## [144] "copper_100g"                               
## [145] "manganese_100g"                            
## [146] "fluoride_100g"                             
## [147] "selenium_100g"                             
## [148] "chromium_100g"                             
## [149] "molybdenum_100g"                           
## [150] "iodine_100g"                               
## [151] "caffeine_100g"                             
## [152] "taurine_100g"                              
## [153] "ph_100g"                                   
## [154] "fruits_vegetables_nuts_100g"               
## [155] "collagen_meat_protein_ratio_100g"          
## [156] "cocoa_100g"                                
## [157] "chlorophyl_100g"                           
## [158] "carbon_footprint_100g"                     
## [159] "nutrition_score_fr_100g"                   
## [160] "nutrition_score_uk_100g"

Ex4: Remove Duplicates

  • information about what was added and when (1:9)
  • meta information about food (10:17, 22:27)
  • where it came from (18:21, 28:34)
  • what it’s made of (35:52)
  • nutrition grades (53:54)
  • some columns whose contents are unclear (55:63)
  • some nutritional information (64:159)
# Positions (within food) of the columns treated as duplicated information.
# NOTE(review): these are hard-coded positional indices -- re-check them
# against names(food) if the column order of the input ever changes.
duplicates <- c(
  4, 6, 11, 13, 15, 17, 18, 20,
  22, 24, 25, 28, 32, 34, 36, 38,
  40, 44, 46, 48, 51, 54, 65, 158
)

# food2 is food with every column listed in duplicates dropped
# (a negative column index excludes those positions).
food2 <- food[, -duplicates]
  

Ex5: Removing useless info

  • Some columns add nothing useful to the analysis. For example:
    • the first few columns contain internal codes that don’t have any meaning to us
    • some column names aren’t clear enough to tell what the columns contain.
# Positions (within food2) of the columns to discard as uninformative:
# the first three columns plus columns 32 through 41.
useless <- c(1, 2, 3, seq(32, 41))

# food3 is food2 without those columns (a negative column index excludes them)
food3 <- food2[, -useless]
  

Ex6: Finding columns

All of the columns with nutrition info contain the character string “100g” as part of their name, which makes it easy to identify them.

# str_detect() comes from stringr; the namespace-qualified call below does not
# depend on library(stringr) having been run first.

# Logical mask over the columns of food3: TRUE where the column name
# contains "100g", FALSE otherwise
nutrition <- stringr::str_detect(string = names(food3), pattern = "100g")

# Optional sanity check: how many columns were flagged (TRUE) vs. not (FALSE)
summary(nutrition)
##    Mode   FALSE    TRUE 
## logical      29      94
# Per-column summary of only the columns flagged TRUE in the nutrition mask
summary(food3[, nutrition])
##  energy_from_fat_100g    fat_100g      saturated_fat_100g
##  Min.   :   0.00      Min.   :  0.00   Min.   : 0.000    
##  1st Qu.:  35.98      1st Qu.:  0.90   1st Qu.: 0.200    
##  Median : 237.00      Median :  6.00   Median : 1.700    
##  Mean   : 668.41      Mean   : 13.39   Mean   : 4.874    
##  3rd Qu.: 974.00      3rd Qu.: 20.00   3rd Qu.: 6.500    
##  Max.   :2900.00      Max.   :100.00   Max.   :57.000    
##  NA's   :1486         NA's   :708      NA's   :797       
##  butyric_acid_100g caproic_acid_100g caprylic_acid_100g capric_acid_100g
##  Mode:logical      Mode:logical      Mode:logical       Mode:logical    
##  NA's:1500         NA's:1500         NA's:1500          NA's:1500       
##                                                                         
##                                                                         
##                                                                         
##                                                                         
##                                                                         
##  lauric_acid_100g myristic_acid_100g palmitic_acid_100g stearic_acid_100g
##  Mode:logical     Mode:logical       Mode:logical       Mode:logical     
##  NA's:1500        NA's:1500          NA's:1500          NA's:1500        
##                                                                          
##                                                                          
##                                                                          
##                                                                          
##                                                                          
##  arachidic_acid_100g behenic_acid_100g lignoceric_acid_100g
##  Mode:logical        Mode:logical      Mode:logical        
##  NA's:1500           NA's:1500         NA's:1500           
##                                                            
##                                                            
##                                                            
##                                                            
##                                                            
##  cerotic_acid_100g montanic_acid_100g melissic_acid_100g
##  Mode:logical      Mode:logical       Mode:logical      
##  NA's:1500         NA's:1500          NA's:1500         
##                                                         
##                                                         
##                                                         
##                                                         
##                                                         
##  monounsaturated_fat_100g polyunsaturated_fat_100g omega_3_fat_100g
##  Min.   : 0.00            Min.   : 0.400           Min.   : 0.033  
##  1st Qu.: 3.87            1st Qu.: 1.653           1st Qu.: 1.300  
##  Median : 9.50            Median : 3.900           Median : 3.000  
##  Mean   :19.77            Mean   : 9.986           Mean   : 3.726  
##  3rd Qu.:29.00            3rd Qu.:12.700           3rd Qu.: 3.200  
##  Max.   :75.00            Max.   :46.200           Max.   :12.400  
##  NA's   :1465             NA's   :1464             NA's   :1491    
##  alpha_linolenic_acid_100g eicosapentaenoic_acid_100g
##  Min.   :0.0800            Min.   :0.721             
##  1st Qu.:0.0905            1st Qu.:0.721             
##  Median :0.1010            Median :0.721             
##  Mean   :0.1737            Mean   :0.721             
##  3rd Qu.:0.2205            3rd Qu.:0.721             
##  Max.   :0.3400            Max.   :0.721             
##  NA's   :1497              NA's   :1499              
##  docosahexaenoic_acid_100g omega_6_fat_100g linoleic_acid_100g
##  Min.   :1.09              Min.   :0.25     Min.   :0.5000    
##  1st Qu.:1.09              1st Qu.:0.25     1st Qu.:0.5165    
##  Median :1.09              Median :0.25     Median :0.5330    
##  Mean   :1.09              Mean   :0.25     Mean   :0.5330    
##  3rd Qu.:1.09              3rd Qu.:0.25     3rd Qu.:0.5495    
##  Max.   :1.09              Max.   :0.25     Max.   :0.5660    
##  NA's   :1499              NA's   :1499     NA's   :1498      
##  arachidonic_acid_100g gamma_linolenic_acid_100g
##  Mode:logical          Mode:logical             
##  NA's:1500             NA's:1500                
##                                                 
##                                                 
##                                                 
##                                                 
##                                                 
##  dihomo_gamma_linolenic_acid_100g omega_9_fat_100g oleic_acid_100g
##  Mode:logical                     Mode:logical     Mode:logical   
##  NA's:1500                        NA's:1500        NA's:1500      
##                                                                   
##                                                                   
##                                                                   
##                                                                   
##                                                                   
##  elaidic_acid_100g gondoic_acid_100g mead_acid_100g erucic_acid_100g
##  Mode:logical      Mode:logical      Mode:logical   Mode:logical    
##  NA's:1500         NA's:1500         NA's:1500      NA's:1500       
##                                                                     
##                                                                     
##                                                                     
##                                                                     
##                                                                     
##  nervonic_acid_100g trans_fat_100g   cholesterol_100g carbohydrates_100g
##  Mode:logical       Min.   :0.0000   Min.   :0.0000   Min.   :  0.000   
##  NA's:1500          1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:  3.792   
##                     Median :0.0000   Median :0.0000   Median : 13.500   
##                     Mean   :0.0105   Mean   :0.0265   Mean   : 27.958   
##                     3rd Qu.:0.0000   3rd Qu.:0.0026   3rd Qu.: 55.000   
##                     Max.   :0.1000   Max.   :0.4300   Max.   :100.000   
##                     NA's   :1481     NA's   :1477     NA's   :708       
##   sugars_100g     sucrose_100g   glucose_100g   fructose_100g 
##  Min.   :  0.00   Mode:logical   Mode:logical   Min.   :100   
##  1st Qu.:  1.00   NA's:1500      NA's:1500      1st Qu.:100   
##  Median :  4.05                                 Median :100   
##  Mean   : 12.66                                 Mean   :100   
##  3rd Qu.: 14.70                                 3rd Qu.:100   
##  Max.   :100.00                                 Max.   :100   
##  NA's   :788                                    NA's   :1499  
##   lactose_100g   maltose_100g   maltodextrins_100g  starch_100g   
##  Min.   :0.000   Mode:logical   Mode:logical       Min.   : 0.00  
##  1st Qu.:0.250   NA's:1500      NA's:1500          1st Qu.: 9.45  
##  Median :0.500                                     Median :39.50  
##  Mean   :2.933                                     Mean   :30.73  
##  3rd Qu.:4.400                                     3rd Qu.:42.85  
##  Max.   :8.300                                     Max.   :71.00  
##  NA's   :1497                                      NA's   :1493   
##   polyols_100g     fiber_100g     proteins_100g     casein_100g  
##  Min.   : 8.60   Min.   : 0.000   Min.   : 0.000   Min.   :1.1   
##  1st Qu.:59.10   1st Qu.: 0.500   1st Qu.: 1.500   1st Qu.:1.1   
##  Median :67.00   Median : 1.750   Median : 6.000   Median :1.1   
##  Mean   :56.06   Mean   : 2.823   Mean   : 7.563   Mean   :1.1   
##  3rd Qu.:69.80   3rd Qu.: 3.500   3rd Qu.:10.675   3rd Qu.:1.1   
##  Max.   :70.00   Max.   :46.700   Max.   :61.000   Max.   :1.1   
##  NA's   :1491    NA's   :994      NA's   :710      NA's   :1499  
##  serum_proteins_100g nucleotides_100g   salt_100g         sodium_100g     
##  Mode:logical        Mode:logical     Min.   :  0.0000   Min.   : 0.0000  
##  NA's:1500           NA's:1500        1st Qu.:  0.0438   1st Qu.: 0.0172  
##                                       Median :  0.4498   Median : 0.1771  
##                                       Mean   :  1.1205   Mean   : 0.4409  
##                                       3rd Qu.:  1.1938   3rd Qu.: 0.4700  
##                                       Max.   :102.0000   Max.   :40.0000  
##                                       NA's   :780        NA's   :780      
##   alcohol_100g   vitamin_a_100g   beta_carotene_100g vitamin_d_100g 
##  Min.   : 0.00   Min.   :0.0000   Mode:logical       Min.   :0e+00  
##  1st Qu.: 0.00   1st Qu.:0.0000   NA's:1500          1st Qu.:0e+00  
##  Median : 5.50   Median :0.0001                      Median :0e+00  
##  Mean   :10.07   Mean   :0.0003                      Mean   :0e+00  
##  3rd Qu.:13.00   3rd Qu.:0.0006                      3rd Qu.:0e+00  
##  Max.   :50.00   Max.   :0.0013                      Max.   :1e-04  
##  NA's   :1433    NA's   :1477                        NA's   :1485   
##  vitamin_e_100g   vitamin_k_100g vitamin_c_100g  vitamin_b1_100g 
##  Min.   :0.0005   Min.   :0      Min.   :0.000   Min.   :0.0001  
##  1st Qu.:0.0021   1st Qu.:0      1st Qu.:0.002   1st Qu.:0.0003  
##  Median :0.0044   Median :0      Median :0.019   Median :0.0004  
##  Mean   :0.0069   Mean   :0      Mean   :0.025   Mean   :0.0006  
##  3rd Qu.:0.0097   3rd Qu.:0      3rd Qu.:0.030   3rd Qu.:0.0010  
##  Max.   :0.0320   Max.   :0      Max.   :0.217   Max.   :0.0013  
##  NA's   :1478     NA's   :1498   NA's   :1459    NA's   :1478    
##  vitamin_b2_100g  vitamin_pp_100g  vitamin_b6_100g  vitamin_b9_100g
##  Min.   :0.0002   Min.   :0.0006   Min.   :0.0001   Min.   :0e+00  
##  1st Qu.:0.0003   1st Qu.:0.0033   1st Qu.:0.0002   1st Qu.:0e+00  
##  Median :0.0009   Median :0.0069   Median :0.0008   Median :1e-04  
##  Mean   :0.0011   Mean   :0.0086   Mean   :0.0112   Mean   :1e-04  
##  3rd Qu.:0.0013   3rd Qu.:0.0140   3rd Qu.:0.0012   3rd Qu.:2e-04  
##  Max.   :0.0066   Max.   :0.0160   Max.   :0.2000   Max.   :2e-04  
##  NA's   :1483     NA's   :1484     NA's   :1481     NA's   :1483   
##  vitamin_b12_100g  biotin_100g   pantothenic_acid_100g  silica_100g   
##  Min.   :0        Min.   :0      Min.   :0.0000        Min.   :8e-04  
##  1st Qu.:0        1st Qu.:0      1st Qu.:0.0007        1st Qu.:8e-04  
##  Median :0        Median :0      Median :0.0020        Median :8e-04  
##  Mean   :0        Mean   :0      Mean   :0.0027        Mean   :8e-04  
##  3rd Qu.:0        3rd Qu.:0      3rd Qu.:0.0051        3rd Qu.:8e-04  
##  Max.   :0        Max.   :0      Max.   :0.0060        Max.   :8e-04  
##  NA's   :1489     NA's   :1498   NA's   :1486          NA's   :1499   
##  bicarbonate_100g potassium_100g   chloride_100g     calcium_100g   
##  Min.   :0.0006   Min.   :0.0000   Min.   :0.0003   Min.   :0.0000  
##  1st Qu.:0.0678   1st Qu.:0.0650   1st Qu.:0.0006   1st Qu.:0.0450  
##  Median :0.1350   Median :0.1940   Median :0.0009   Median :0.1200  
##  Mean   :0.1692   Mean   :0.3288   Mean   :0.0144   Mean   :0.2040  
##  3rd Qu.:0.2535   3rd Qu.:0.3670   3rd Qu.:0.0214   3rd Qu.:0.1985  
##  Max.   :0.3720   Max.   :1.4300   Max.   :0.0420   Max.   :1.0000  
##  NA's   :1497     NA's   :1487     NA's   :1497     NA's   :1449    
##  phosphorus_100g    iron_100g      magnesium_100g     zinc_100g     
##  Min.   :0.0430   Min.   :0.0000   Min.   :0.0000   Min.   :0.0005  
##  1st Qu.:0.1938   1st Qu.:0.0012   1st Qu.:0.0670   1st Qu.:0.0009  
##  Median :0.3185   Median :0.0042   Median :0.1040   Median :0.0017  
##  Mean   :0.3777   Mean   :0.0045   Mean   :0.1066   Mean   :0.0016  
##  3rd Qu.:0.4340   3rd Qu.:0.0077   3rd Qu.:0.1300   3rd Qu.:0.0022  
##  Max.   :1.1550   Max.   :0.0137   Max.   :0.3330   Max.   :0.0026  
##  NA's   :1488     NA's   :1463     NA's   :1479     NA's   :1493    
##   copper_100g    manganese_100g fluoride_100g  selenium_100g 
##  Min.   :0e+00   Min.   :0      Min.   :0      Min.   :0     
##  1st Qu.:1e-04   1st Qu.:0      1st Qu.:0      1st Qu.:0     
##  Median :1e-04   Median :0      Median :0      Median :0     
##  Mean   :1e-04   Mean   :0      Mean   :0      Mean   :0     
##  3rd Qu.:1e-04   3rd Qu.:0      3rd Qu.:0      3rd Qu.:0     
##  Max.   :1e-04   Max.   :0      Max.   :0      Max.   :0     
##  NA's   :1498    NA's   :1499   NA's   :1498   NA's   :1499  
##  chromium_100g  molybdenum_100g  iodine_100g   caffeine_100g 
##  Mode:logical   Mode:logical    Min.   :0      Mode:logical  
##  NA's:1500      NA's:1500       1st Qu.:0      NA's:1500     
##                                 Median :0                    
##                                 Mean   :0                    
##                                 3rd Qu.:0                    
##                                 Max.   :0                    
##                                 NA's   :1499                 
##  taurine_100g   ph_100g        fruits_vegetables_nuts_100g
##  Mode:logical   Mode:logical   Min.   : 2.00              
##  NA's:1500      NA's:1500      1st Qu.:11.25              
##                                Median :42.00              
##                                Mean   :36.88              
##                                3rd Qu.:52.25              
##                                Max.   :80.00              
##                                NA's   :1470               
##  collagen_meat_protein_ratio_100g   cocoa_100g   chlorophyl_100g
##  Min.   :12.00                    Min.   :30     Mode:logical   
##  1st Qu.:13.50                    1st Qu.:47     NA's:1500      
##  Median :15.00                    Median :60                    
##  Mean   :15.67                    Mean   :57                    
##  3rd Qu.:17.50                    3rd Qu.:70                    
##  Max.   :20.00                    Max.   :81                    
##  NA's   :1497                     NA's   :1491                  
##  nutrition_score_fr_100g nutrition_score_uk_100g
##  Min.   :-12.000         Min.   :-12.000        
##  1st Qu.:  1.000         1st Qu.:  0.000        
##  Median :  7.000         Median :  6.000        
##  Mean   :  7.941         Mean   :  7.631        
##  3rd Qu.: 15.000         3rd Qu.: 16.000        
##  Max.   : 28.000         Max.   : 28.000        
##  NA's   :825             NA's   :825

Ex7: Replacing missing values

summary() revealed that the nutrition data are mostly NA values. Reason: the food just doesn’t have those nutrients.

For sugar content, zero values are sometimes entered explicitly, but sometimes the value is just left empty to denote a zero.


# Flag the entries of the sugars_100g column that are NA: missing
missing <- is.na(food3[["sugars_100g"]])

# An empty sugar value is read as "zero sugar", so overwrite the flagged
# entries with 0 and leave every other value as it is
food3[["sugars_100g"]] <- replace(food3[["sugars_100g"]], missing, 0)

# First histogram: sugar content with the zero-filled entries included
hist(x = food3$sugars_100g, breaks = 100)


# Create food4: the rows of food3 whose sugar content is not zero
food4 <- food3[food3[["sugars_100g"]] != 0, ]

# Second histogram: the same distribution once the zero-sugar foods are gone
hist(x = food4$sugars_100g, breaks = 100)

Ex8: messy data

how many of these foods come in some sort of plastic packaging?

packaging info in dataset = stored in several different languages (Spanish, French, and English).

the root of the word for plastic is the same in all three languages:
* English (plastic)
* French (plastique)
* Spanish (plastico)

# library(stringr)

# Flag the packaging entries that contain the shared root "plasti": plastic
# Note: the match is case-sensitive, and str_detect() returns NA (not FALSE)
# for any packaging entry that is itself NA.
plastic <- str_detect(food3$packaging, "plasti")

# Count how many of the foods are packaged in plastic.
# na.rm = TRUE keeps the count defined even if some packaging entries are NA;
# without it a single NA would turn the whole sum into NA.
sum(plastic, na.rm = TRUE)
## [1] 232

School Attendance

attendance data from public schools in the US, organized by school level and state, during the 2007-2008 academic year

average daily attendance (ADA) as a percentage of total enrollment, school day length, and school year length.

Ex1: Import Data

Load the gdata package

# install.packages("gdata")
library(gdata)

Import the spreadsheet


# Location of the attendance spreadsheet (.xls)
url_att <- "http://s3.amazonaws.com/assets.datacamp.com/production/course_1294/datasets/attendance.xls"

# Error in findPerl(verbose = verbose): perl executable not found.
# Use perl= argument to specify the correct path.
# Error in file.exists(tfn) : invalid 'file' argument
# => requires the perl argument, e.g. perl = "C:\\Perl\\bin\\perl.exe"
# https://www.activestate.com/activeperl/downloads

# att <- read.xls(url_att, perl = "C:\\Perl64\\bin\\perl.exe")
# this function is part of an archived CRAN package
# https://cran.r-project.org/web/packages/xlsReadWrite/
# found here also:
# https://www.rdocumentation.org/packages/gdata/versions/2.18.0/topics/read.xls

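# does not work!? -- probably because download.file() defaults to text mode
# (mode = "w") on Windows, which corrupts binary files such as .xls
# download.file(url_att, "C:/Temp/attendance.xls")
# likely fix (untested here): request a binary transfer with mode = "wb"
# download.file(url_att, "C:/Temp/attendance.xls", mode = "wb")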

library(readxl)
library(httr)
url <- "http://s3.amazonaws.com/assets.datacamp.com/production/course_1294/datasets/attendance.xls"
# Download the workbook straight to disk, replacing any copy left over from
# an earlier run.
resp <- GET(url, write_disk("attendance.xls", overwrite = TRUE))
# Stop here on an HTTP error: otherwise the error response would be saved as
# attendance.xls and read_excel() would fail later with a confusing message.
stop_for_status(resp)
# Print the response summary (status, size, path on disk).
resp
## Response [http://s3.amazonaws.com/assets.datacamp.com/production/course_1294/datasets/attendance.xls]
##   Date: 2018-05-04 13:41
##   Status: 200
##   Content-Type: 
##   Size: 28.2 kB
## <ON DISK>  D:\DATA\python\R - notes\Data Scientist with R\7 - importing-cleaning-data-in-r-case-studies\attendance.xls
# Read the downloaded workbook (read_excel() uses the first sheet by default): att
att <- read_excel(path = "attendance.xls")
# Peek at the top three rows
head(att, n = 3)
## # A tibble: 3 x 17
##   `Table 43. Averag~ X__1   X__2 X__3   X__4 X__5   X__6 X__7   X__8 X__9 
##   <chr>              <chr> <dbl> <chr> <dbl> <chr> <dbl> <chr> <dbl> <chr>
## 1 <NA>               Tota~    NA <NA>     NA <NA>     NA <NA>     NA Elem~
## 2 <NA>               ADA ~    NA Aver~    NA Aver~    NA Aver~    NA ADA ~
## 3 1                  2        NA 3        NA 4        NA 5        NA 6    
## # ... with 7 more variables: X__10 <dbl>, X__11 <chr>, X__12 <dbl>,
## #   X__13 <chr>, X__14 <chr>, X__15 <chr>, X__16 <chr>

Ex2: Examine Data

first row is a description of the data; second row is a variable itself that groups multiple columns together; fourth row gives numbers for the columns – nice in a spreadsheet, but not very useful here

messy data - column names are mostly missing

# List the column names: the first is the sheet's title, the rest are
# X__1 ... X__16 placeholders
names(att)
##  [1] "Table 43. Average daily attendance (ADA) as a percentage of total enrollment, school day length, and school year length in public schools, by school level and state: 2007-08"
##  [2] "X__1"                                                                                                                                                                         
##  [3] "X__2"                                                                                                                                                                         
##  [4] "X__3"                                                                                                                                                                         
##  [5] "X__4"                                                                                                                                                                         
##  [6] "X__5"                                                                                                                                                                         
##  [7] "X__6"                                                                                                                                                                         
##  [8] "X__7"                                                                                                                                                                         
##  [9] "X__8"                                                                                                                                                                         
## [10] "X__9"                                                                                                                                                                         
## [11] "X__10"                                                                                                                                                                        
## [12] "X__11"                                                                                                                                                                        
## [13] "X__12"                                                                                                                                                                        
## [14] "X__13"                                                                                                                                                                        
## [15] "X__14"                                                                                                                                                                        
## [16] "X__15"                                                                                                                                                                        
## [17] "X__16"
# First six rows: two header rows and a row of column numbers come before the
# actual data
head(att)
## # A tibble: 6 x 17
##   `Table 43. Average~ X__1     X__2 X__3     X__4 X__5    X__6 X__7   X__8
##   <chr>               <chr>   <dbl> <chr>   <dbl> <chr>  <dbl> <chr> <dbl>
## 1 <NA>                Total~ NA     <NA>  NA      <NA>  NA     <NA>  NA   
## 2 <NA>                ADA a~ NA     Aver~ NA      Aver~ NA     Aver~ NA   
## 3 1                   2      NA     3     NA      4     NA     5     NA   
## 4 United States ....~ 93.07~  0.219 6.64~  0.0176 180    0.143 1192~  3.09
## 5 Alabama ..........~ 93.81~  1.24  7.02~  0.0656 180    0.755 1266~ 12.3 
## 6 Alaska ...........~ 89.91~  1.22  6.47~  0.0499 180    3.43  1162~ 22.9 
## # ... with 8 more variables: X__9 <chr>, X__10 <dbl>, X__11 <chr>,
## #   X__12 <dbl>, X__13 <chr>, X__14 <chr>, X__15 <chr>, X__16 <chr>

messy data - irrelevant notes at the end of the data frame

# Last six rows: the final four hold footnotes, not data
tail(att)
## # A tibble: 6 x 17
##   `Table 43. Average ~ X__1    X__2 X__3     X__4 X__5    X__6 X__7   X__8
##   <chr>                <chr>  <dbl> <chr>   <dbl> <chr>  <dbl> <chr> <dbl>
## 1 Wisconsin .........~ 94.9~  0.566 6.91~  0.0427 180    0.736 1246~  8.63
## 2 Wyoming ...........~ 92.3~  1.15  6.85~  0.0458 175    1.28  1200~  8.33
## 3 †Not applicable.     <NA>  NA     <NA>  NA      <NA>  NA     <NA>  NA   
## 4 ‡Reporting standard~ <NA>  NA     <NA>  NA      <NA>  NA     <NA>  NA   
## 5 NOTE: Averages refl~ <NA>  NA     <NA>  NA      <NA>  NA     <NA>  NA   
## 6 "SOURCE: U.S. Depar~ <NA>  NA     <NA>  NA      <NA>  NA     <NA>  NA   
## # ... with 8 more variables: X__9 <chr>, X__10 <dbl>, X__11 <chr>,
## #   X__12 <dbl>, X__13 <chr>, X__14 <chr>, X__15 <chr>, X__16 <chr>

messy data - numeric data were imported as character strings (chr) rather than numbers (with read.xls() they would typically arrive as factors)

# Inspect the column types: several columns of numbers were read in as chr
str(att)
## Classes 'tbl_df', 'tbl' and 'data.frame':    68 obs. of  17 variables:
##  $ Table 43. Average daily attendance (ADA) as a percentage of total enrollment, school day length, and school year length in public schools, by school level and state: 2007-08: chr  NA NA "1" "United States ........" ...
##  $ X__1                                                                                                                                                                         : chr  "Total elementary, secondary, and combined elementary/secondary schools" "ADA as percent of enrollment" "2" "93.078962000000004" ...
##  $ X__2                                                                                                                                                                         : num  NA NA NA 0.219 1.237 ...
##  $ X__3                                                                                                                                                                         : chr  NA "Average hours in school day" "3" "6.6447000000000003" ...
##  $ X__4                                                                                                                                                                         : num  NA NA NA 0.0176 0.0656 ...
##  $ X__5                                                                                                                                                                         : chr  NA "Average days in school year" "4" "180" ...
##  $ X__6                                                                                                                                                                         : num  NA NA NA 0.143 0.755 ...
##  $ X__7                                                                                                                                                                         : chr  NA "Average hours in school year" "5" "1192.6472000000001" ...
##  $ X__8                                                                                                                                                                         : num  NA NA NA 3.09 12.33 ...
##  $ X__9                                                                                                                                                                         : chr  "Elementary schools" "ADA as percent of enrollment" "6" "94.004982999999996" ...
##  $ X__10                                                                                                                                                                        : num  NA NA NA 0.269 1.839 ...
##  $ X__11                                                                                                                                                                        : chr  NA "Average hours in school day" "7" "6.6560560000000004" ...
##  $ X__12                                                                                                                                                                        : num  NA NA NA 0.016 0.0759 ...
##  $ X__13                                                                                                                                                                        : chr  "Secondary schools" "ADA as percent of enrollment" "8" "91.118081000000004" ...
##  $ X__14                                                                                                                                                                        : chr  NA NA NA "0.43222300000000002" ...
##  $ X__15                                                                                                                                                                        : chr  NA "Average hours in school day" "9" "6.5943940000000003" ...
##  $ X__16                                                                                                                                                                        : chr  NA NA NA "0.040307999999999997" ...

Ex3: Removing unnecessary rows

useless rows: 1, 4, 11, & 17

Do NOT do this: att2 <- att[-c(1, 4, 11, 17), ]

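Because:
* the read.xls() function imported the first row of the original spreadsheet as the variable name for the first column
* as a result, the first six rows of att aren’t the same as the first six rows you saw in the original spreadsheet
* the read.xls() function skips empty rows such as the 11th and 17th

(Note: att in these notes was imported with read_excel() rather than read.xls(), so row positions may differ from the ones quoted for read.xls().)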

get rid of the third row of att, as well as rows 56 through 59.


# Rows to drop from att: remove
#  * row 3 only repeats the spreadsheet's column numbers (1, 2, 3, ...);
#  * every row that is NA in all columns except the first is either a footnote
#    (text in column 1 only) or, if present, an entirely empty row.
# These rows are located by content rather than by position: the 56:59 quoted
# in the exercise applies to the read.xls() import, whereas in this
# read_excel() import (68 rows) the footnotes are the last four rows, as
# tail(att) shows, so c(3, 56:59) would leave them in and drop other rows.
# The two header rows are kept here because they have text beyond column 1.
no_data <- rowSums(!is.na(att[, -1])) == 0
remove <- union(3L, which(no_data))

att2 <- att[-remove, ]

# Preview the result: the column-number row should be gone
head(att2)
## # A tibble: 6 x 17
##   `Table 43. Average~ X__1     X__2 X__3     X__4 X__5    X__6 X__7   X__8
##   <chr>               <chr>   <dbl> <chr>   <dbl> <chr>  <dbl> <chr> <dbl>
## 1 <NA>                Total~ NA     <NA>  NA      <NA>  NA     <NA>  NA   
## 2 <NA>                ADA a~ NA     Aver~ NA      Aver~ NA     Aver~ NA   
## 3 United States ....~ 93.07~  0.219 6.64~  0.0176 180    0.143 1192~  3.09
## 4 Alabama ..........~ 93.81~  1.24  7.02~  0.0656 180    0.755 1266~ 12.3 
## 5 Alaska ...........~ 89.91~  1.22  6.47~  0.0499 180    3.43  1162~ 22.9 
## 6 Arizona ..........~ 89.03~  2.95  6.43~  0.0919 181    1.68  1159~ 14.4 
## # ... with 8 more variables: X__9 <chr>, X__10 <dbl>, X__11 <chr>,
## #   X__12 <dbl>, X__13 <chr>, X__14 <chr>, X__15 <chr>, X__16 <chr>


# Check the dimensions and column types after the row removal
str(att2)
## Classes 'tbl_df', 'tbl' and 'data.frame':    63 obs. of  17 variables:
##  $ Table 43. Average daily attendance (ADA) as a percentage of total enrollment, school day length, and school year length in public schools, by school level and state: 2007-08: chr  NA NA "United States ........" "Alabama ................." ...
##  $ X__1                                                                                                                                                                         : chr  "Total elementary, secondary, and combined elementary/secondary schools" "ADA as percent of enrollment" "93.078962000000004" "93.812370999999999" ...
##  $ X__2                                                                                                                                                                         : num  NA NA 0.219 1.237 1.216 ...
##  $ X__3                                                                                                                                                                         : chr  NA "Average hours in school day" "6.6447000000000003" "7.0285200000000003" ...
##  $ X__4                                                                                                                                                                         : num  NA NA 0.0176 0.0656 0.0499 ...
##  $ X__5                                                                                                                                                                         : chr  NA "Average days in school year" "180" "180" ...
##  $ X__6                                                                                                                                                                         : num  NA NA 0.143 0.755 3.428 ...
##  $ X__7                                                                                                                                                                         : chr  NA "Average hours in school year" "1192.6472000000001" "1266.6205" ...
##  $ X__8                                                                                                                                                                         : num  NA NA 3.09 12.33 22.86 ...
##  $ X__9                                                                                                                                                                         : chr  "Elementary schools" "ADA as percent of enrollment" "94.004982999999996" "93.776375000000002" ...
##  $ X__10                                                                                                                                                                        : num  NA NA 0.269 1.839 1.563 ...
##  $ X__11                                                                                                                                                                        : chr  NA "Average hours in school day" "6.6560560000000004" "7.0384729999999998" ...
##  $ X__12                                                                                                                                                                        : num  NA NA 0.016 0.0759 0.0531 ...
##  $ X__13                                                                                                                                                                        : chr  "Secondary schools" "ADA as percent of enrollment" "91.118081000000004" "94.561001000000005" ...
##  $ X__14                                                                                                                                                                        : chr  NA NA "0.43222300000000002" "0.37928000000000001" ...
##  $ X__15                                                                                                                                                                        : chr  NA "Average hours in school day" "6.5943940000000003" "7.1372390000000001" ...
##  $ X__16                                                                                                                                                                        : chr  NA NA "0.040307999999999997" "0.17312" ...

Ex4: Removing useless columns

columns 3, 5, 7, 9, 11, 13, 15, and 17 (or columns C, E, G, I, K, M, O, Q in Excel) don’t contain the values of average daily attendance (ADA)


# Positions of the odd-numbered columns from 3 to 17, the ones to drop: remove
remove <- seq(from = 3, to = 17, by = 2)

# Keep every column except those positions: att3
att3 <- att2[, -remove]

# Preview the slimmed-down table
head(x = att3)
## # A tibble: 6 x 9
##   `Table 43. Average ~ X__1    X__3   X__5   X__7  X__9  X__11 X__13 X__15
##   <chr>                <chr>   <chr>  <chr>  <chr> <chr> <chr> <chr> <chr>
## 1 <NA>                 Total ~ <NA>   <NA>   <NA>  Elem~ <NA>  Seco~ <NA> 
## 2 <NA>                 ADA as~ Avera~ Avera~ Aver~ ADA ~ Aver~ ADA ~ Aver~
## 3 United States .....~ 93.078~ 6.644~ 180    1192~ 94.0~ 6.65~ 91.1~ 6.59~
## 4 Alabama ...........~ 93.812~ 7.028~ 180    1266~ 93.7~ 7.03~ 94.5~ 7.13~
## 5 Alaska ............~ 89.917~ 6.476~ 180    1162~ 91.2~ 6.48~ 93.2~ 6.24~
## 6 Arizona ...........~ 89.036~ 6.433~ 181    1159~ 88.9~ 6.44~ 88.9~ 6.36~


# Check that only 9 of the 17 columns remain
str(att3)
## Classes 'tbl_df', 'tbl' and 'data.frame':    63 obs. of  9 variables:
##  $ Table 43. Average daily attendance (ADA) as a percentage of total enrollment, school day length, and school year length in public schools, by school level and state: 2007-08: chr  NA NA "United States ........" "Alabama ................." ...
##  $ X__1                                                                                                                                                                         : chr  "Total elementary, secondary, and combined elementary/secondary schools" "ADA as percent of enrollment" "93.078962000000004" "93.812370999999999" ...
##  $ X__3                                                                                                                                                                         : chr  NA "Average hours in school day" "6.6447000000000003" "7.0285200000000003" ...
##  $ X__5                                                                                                                                                                         : chr  NA "Average days in school year" "180" "180" ...
##  $ X__7                                                                                                                                                                         : chr  NA "Average hours in school year" "1192.6472000000001" "1266.6205" ...
##  $ X__9                                                                                                                                                                         : chr  "Elementary schools" "ADA as percent of enrollment" "94.004982999999996" "93.776375000000002" ...
##  $ X__11                                                                                                                                                                        : chr  NA "Average hours in school day" "6.6560560000000004" "7.0384729999999998" ...
##  $ X__13                                                                                                                                                                        : chr  "Secondary schools" "ADA as percent of enrollment" "91.118081000000004" "94.561001000000005" ...
##  $ X__15                                                                                                                                                                        : chr  NA "Average hours in school day" "6.5943940000000003" "7.1372390000000001" ...

Ex5: Splitting data

problem: a single data frame can store multiple “tables” of information. diagnose: look at the column names (here still stored in the second row) and notice the duplicates:
* columns 1, 6, and 7 represent attendance data for US elementary schools
* columns 1, 8, and 9 represent data for secondary schools
* columns 1 through 5 represent data for all schools in the US


# Elementary schools: the state column plus columns 6 and 7 -> att_elem
att_elem <- att3[c(1, 6, 7)]

# Secondary schools: the state column plus columns 8 and 9 -> att_sec
att_sec <- att3[c(1, 8, 9)]

# All schools: the first five columns -> att4
att4 <- att3[1:5]

# Preview the all-schools table
head(att4, n = 6)
## # A tibble: 6 x 5
##   `Table 43. Average daily attendan~ X__1          X__3    X__5    X__7   
##   <chr>                              <chr>         <chr>   <chr>   <chr>  
## 1 <NA>                               Total elemen~ <NA>    <NA>    <NA>   
## 2 <NA>                               ADA as perce~ Averag~ Averag~ Averag~
## 3 United States ........             93.078962000~ 6.6447~ 180     1192.6~
## 4 Alabama .................          93.812370999~ 7.0285~ 180     1266.6~
## 5 Alaska ..................          89.917597000~ 6.4768~ 180     1162.9~
## 6 Arizona .................          89.036961000~ 6.4336~ 181     1159.1~


# Check the structure of the all-schools subset (5 columns)
str(att4)
## Classes 'tbl_df', 'tbl' and 'data.frame':    63 obs. of  5 variables:
##  $ Table 43. Average daily attendance (ADA) as a percentage of total enrollment, school day length, and school year length in public schools, by school level and state: 2007-08: chr  NA NA "United States ........" "Alabama ................." ...
##  $ X__1                                                                                                                                                                         : chr  "Total elementary, secondary, and combined elementary/secondary schools" "ADA as percent of enrollment" "93.078962000000004" "93.812370999999999" ...
##  $ X__3                                                                                                                                                                         : chr  NA "Average hours in school day" "6.6447000000000003" "7.0285200000000003" ...
##  $ X__5                                                                                                                                                                         : chr  NA "Average days in school year" "180" "180" ...
##  $ X__7                                                                                                                                                                         : chr  NA "Average hours in school year" "1192.6472000000001" "1266.6205" ...

Ex6: Replacing the names

# The first two rows of att4 hold header text, not data. Rather than reuse
# that text, define short, consistently styled column names: cnames
cnames <- c(
  "state",
  "avg_attend_pct",
  "avg_hr_per_day",
  "avg_day_per_yr",
  "avg_hr_per_yr"
)

# Apply the new names to att4
names(att4) <- cnames

# Drop the two header rows: att5
att5 <- att4[-(1:2), ]

# Confirm att5 carries the new names
names(att5)
## [1] "state"          "avg_attend_pct" "avg_hr_per_day" "avg_day_per_yr"
## [5] "avg_hr_per_yr"



# Preview att5 under its new column names
head(att5)
## # A tibble: 6 x 5
##   state      avg_attend_pct  avg_hr_per_day  avg_day_per_yr avg_hr_per_yr 
##   <chr>      <chr>           <chr>           <chr>          <chr>         
## 1 United St~ 93.07896200000~ 6.644700000000~ 180            1192.64720000~
## 2 Alabama .~ 93.81237099999~ 7.028520000000~ 180            1266.6205     
## 3 Alaska ..~ 89.91759700000~ 6.476880000000~ 180            1162.9084     
## 4 Arizona .~ 89.03696100000~ 6.433690000000~ 181            1159.11439999~
## 5 Arkansas ~ 91.82711100000~ 6.885419999999~ 179            1228.88809999~
## 6 Californi~ 93.24101699999~ 6.24064         181            1128.76939999~


str(att5)
## Classes 'tbl_df', 'tbl' and 'data.frame':    61 obs. of  5 variables:
##  $ state         : chr  "United States ........" "Alabama ................." "Alaska .................." "Arizona ................." ...
##  $ avg_attend_pct: chr  "93.078962000000004" "93.812370999999999" "89.917597000000001" "89.036961000000005" ...
##  $ avg_hr_per_day: chr  "6.6447000000000003" "7.0285200000000003" "6.4768800000000004" "6.4336900000000004" ...
##  $ avg_day_per_yr: chr  "180" "180" "180" "181" ...
##  $ avg_hr_per_yr : chr  "1192.6472000000001" "1266.6205" "1162.9084" "1159.1143999999999" ...

Ex7: Cleaning up extra characters

# The state names are all stored at the same width, with periods padding
# the ends of the shorter names.
# "." is a special character in regular expressions (it matches any
# character), so a literal period must be escaped as "\\." in the pattern.

# str_replace_all(string, pattern, replacement) replaces every match of
# pattern in string with replacement.
# str_trim(string) removes leading and trailing white space.

# Remove all periods in state column, then trim the surrounding white space
att5$state <- str_trim(str_replace_all(att5$state, "\\.", ""))

# View the head of att5
head(att5)
## # A tibble: 6 x 5
##   state         avg_attend_pct avg_hr_per_day avg_day_per_yr avg_hr_per_yr
##   <chr>         <chr>          <chr>          <chr>          <chr>        
## 1 United States 93.0789620000~ 6.64470000000~ 180            1192.6472000~
## 2 Alabama       93.8123709999~ 7.02852000000~ 180            1266.6205    
## 3 Alaska        89.9175970000~ 6.47688000000~ 180            1162.9084    
## 4 Arizona       89.0369610000~ 6.43369000000~ 181            1159.1143999~
## 5 Arkansas      91.8271110000~ 6.88541999999~ 179            1228.8880999~
## 6 California    93.2410169999~ 6.24064        181            1128.7693999~

# Check the structure of att5 (state is clean; other columns still character)
str(att5)
## Classes 'tbl_df', 'tbl' and 'data.frame':    61 obs. of  5 variables:
##  $ state         : chr  "United States" "Alabama" "Alaska" "Arizona" ...
##  $ avg_attend_pct: chr  "93.078962000000004" "93.812370999999999" "89.917597000000001" "89.036961000000005" ...
##  $ avg_hr_per_day: chr  "6.6447000000000003" "7.0285200000000003" "6.4768800000000004" "6.4336900000000004" ...
##  $ avg_day_per_yr: chr  "180" "180" "180" "181" ...
##  $ avg_hr_per_yr : chr  "1192.6472000000001" "1266.6205" "1162.9084" "1159.1143999999999" ...

Ex8: Some final type conversions

# dplyr offers an efficient way to apply one function to many columns at once
library(dplyr)

# Convert every column except state from character to numeric.
# mutate_each() and funs() are deprecated; mutate_at() with vars() is the
# replacement dplyr recommends for mapping a function over selected columns.
# Values that cannot be parsed as numbers become NA.
example <- mutate_at(att5, vars(-state), as.numeric)

# Define vector containing numerical columns: cols
# Every column except the first (state) holds numbers stored as text.
# seq_len(...)[-1] gives an empty index, rather than c(2, 1), if att5 ever
# has only one column.
cols <- seq_len(ncol(att5))[-1]

# Coerce the selected columns to numeric.
# lapply() always returns a list with one element per column, whereas
# sapply() may simplify to a matrix, a vector, or a list depending on the
# number of rows. Single-bracket list-style subsetting keeps att5 a tibble.
# Values that cannot be parsed as numbers become NA.
att5[cols] <- lapply(att5[cols], as.numeric)

# Preview the first rows of att5
head(att5)
## # A tibble: 6 x 5
##   state         avg_attend_pct avg_hr_per_day avg_day_per_yr avg_hr_per_yr
##   <chr>                  <dbl>          <dbl>          <dbl>         <dbl>
## 1 United States           93.1           6.64            180         1193.
## 2 Alabama                 93.8           7.03            180         1267.
## 3 Alaska                  89.9           6.48            180         1163.
## 4 Arizona                 89.0           6.43            181         1159.
## 5 Arkansas                91.8           6.89            179         1229.
## 6 California              93.2           6.24            181         1129.

# Check the structure of att5 (columns 2 to 5 are now numeric)
str(att5)
## Classes 'tbl_df', 'tbl' and 'data.frame':    61 obs. of  5 variables:
##  $ state         : chr  "United States" "Alabama" "Alaska" "Arizona" ...
##  $ avg_attend_pct: num  93.1 93.8 89.9 89 91.8 ...
##  $ avg_hr_per_day: num  6.64 7.03 6.48 6.43 6.89 ...
##  $ avg_day_per_yr: num  180 180 180 181 179 181 NA 171 181 181 ...
##  $ avg_hr_per_yr : num  1193 1267 1163 1159 1229 ...