Introduction


Course notes for Importing and Cleaning Data in R: Case Studies

Whats Covered

  • Ticket Sales Data
  • MBTA Ridership Data
  • World Food Facts
  • School Attendance Data

   


Ticket Sales Data


Importing the data

# Import sales.csv: sales
url_sales <- 'http://s3.amazonaws.com/assets.datacamp.com/production/course_1294/datasets/sales.csv'
sales <- read.csv(url_sales, stringsAsFactors=F)

Examining the data

# View dimensions of sales
dim(sales)
## [1] 5000   46
# Inspect first 6 rows of sales
head(sales)
##   X             event_id       primary_act_id     secondary_act_id
## 1 1 abcaf1adb99a935fc661 43f0436b905bfa7c2eec b85143bf51323b72e53c
## 2 2 6c56d7f08c95f2aa453c 1a3e9aecd0617706a794 f53529c5679ea6ca5a48
## 3 3 c7ab4524a121f9d687d2 4b677c3f5bec71eec8d1 b85143bf51323b72e53c
## 4 4 394cb493f893be9b9ed1 b1ccea01ad6ef8522796 b85143bf51323b72e53c
## 5 5 55b5f67e618557929f48 91c03a34b562436efa3c b85143bf51323b72e53c
## 6 6 4f10fd8b9f550352bd56 ac4b847b3fde66f2117e 63814f3d63317f1b56c4
##    purch_party_lkup_id
## 1 7dfa56dd7d5956b17587
## 2 4f9e6fc637eaf7b736c2
## 3 6c2545703bd527a7144d
## 4 527d6b1eaffc69ddd882
## 5 8bd62c394a35213bdf52
## 6 3b3a628f83135acd0676
##                                                       event_name
## 1 Xfinity Center Mansfield Premier Parking: Florida Georgia Line
## 2                  Gorge Camping - dave matthews band - sept 3-7
## 3                    Dodge Theatre Adams Street Parking - benise
## 4   Gexa Energy Pavilion Vip Parking : kid rock with sheryl crow
## 5                                  Premier Parking - motley crue
## 6                                      Fast Lane Access: Journey
##                           primary_act_name secondary_act_name
## 1 XFINITY Center Mansfield Premier Parking               NULL
## 2                            Gorge Camping Dave Matthews Band
## 3                            Parking Event               NULL
## 4         Gexa Energy Pavilion VIP Parking               NULL
## 5 White River Amphitheatre Premier Parking               NULL
## 6                         Fast Lane Access            Journey
##   major_cat_name         minor_cat_name la_event_type_cat
## 1           MISC                PARKING           PARKING
## 2           MISC                CAMPING           INVALID
## 3           MISC                PARKING           PARKING
## 4           MISC                PARKING           PARKING
## 5           MISC                PARKING           PARKING
## 6           MISC SPECIAL ENTRY (UPSELL)            UPSELL
##                                                  event_disp_name
## 1 Xfinity Center Mansfield Premier Parking: Florida Georgia Line
## 2                  Gorge Camping - dave matthews band - sept 3-7
## 3                    Dodge Theatre Adams Street Parking - benise
## 4   Gexa Energy Pavilion Vip Parking : kid rock with sheryl crow
## 5                                  Premier Parking - motley crue
## 6                                      Fast Lane Access: Journey
##                                                                                                                                                    ticket_text
## 1    THIS TICKET IS VALID        FOR PARKING ONLY         GOOD THIS DAY ONLY       PREMIER PARKING PASS    XFINITY CENTER,LOTS 4 PM  SAT SEP 12 2015 7:30 PM  
## 2                                                                %OVERNIGHT C A M P I N G%* * * * * *%GORGE CAMPGROUND%* GOOD THIS DATE ONLY *%SEP 3 - 6, 2009
## 3                               ADAMS STREET GARAGE%PARKING FOR 4/21/06 ONLY%DODGE THEATRE PARKING PASS%ENTRANCE ON ADAMS STREET%BENISE%GARAGE OPENS AT 6:00PM
## 4    THIS TICKET IS VALID        FOR PARKING ONLY      GOOD FOR THIS DATE ONLY       VIP PARKING PASS        GEXA ENERGY PAVILION    FRI SEP 02 2011 7:00 PM  
## 5                              THIS TICKET IS VALID%FOR PARKING ONLY%GOOD THIS DATE ONLY%PREMIER PARKING PASS%WHITE RIVER AMPHITHEATRE%SAT JUL 30, 2005 6:00PM
## 6         FAST LANE                  JOURNEY               FAST LANE EVENT         THIS IS NOT A TICKET    SAN MANUEL AMPHITHEATER   SAT JUL 21 2012 7:00 PM  
##   tickets_purchased_qty trans_face_val_amt delivery_type_cd
## 1                     1                 45          eTicket
## 2                     1                 75       TicketFast
## 3                     1                  5       TicketFast
## 4                     1                 20             Mail
## 5                     1                 20             Mail
## 6                     2                 10       TicketFast
##       event_date_time   event_dt presale_dt  onsale_dt
## 1 2015-09-12 23:30:00 2015-09-12       NULL 2015-05-15
## 2 2009-09-05 01:00:00 2009-09-04       NULL 2009-03-13
## 3 2006-04-22 01:30:00 2006-04-21       NULL 2006-02-25
## 4 2011-09-03 00:00:00 2011-09-02       NULL 2011-04-22
## 5 2005-07-31 01:00:00 2005-07-30 2005-03-02 2005-03-04
## 6 2012-07-22 02:00:00 2012-07-21       NULL 2012-04-11
##   sales_ord_create_dttm sales_ord_tran_dt   print_dt timezn_nm
## 1   2015-09-11 18:17:45        2015-09-11 2015-09-12       EST
## 2   2009-07-06 00:00:00        2009-07-05 2009-09-01       PST
## 3   2006-04-05 00:00:00        2006-04-05 2006-04-05       MST
## 4   2011-07-01 17:38:50        2011-07-01 2011-07-06       CST
## 5   2005-06-18 00:00:00        2005-06-18 2005-06-28       PST
## 6   2012-07-21 17:20:18        2012-07-21 2012-07-21       PST
##       venue_city   venue_state venue_postal_cd_sgmt_1
## 1      MANSFIELD MASSACHUSETTS                  02048
## 2         QUINCY    WASHINGTON                  98848
## 3        PHOENIX       ARIZONA                  85003
## 4         DALLAS         TEXAS                  75210
## 5         AUBURN    WASHINGTON                  98092
## 6 SAN BERNARDINO    CALIFORNIA                  92407
##             sales_platform_cd print_flg la_valid_tkt_event_flg  fin_mkt_nm
## 1 www.concerts.livenation.com        T                      N       Boston
## 2                        NULL        T                      N      Seattle
## 3                        NULL        T                      N      Arizona
## 4                        NULL        T                      N       Dallas
## 5                        NULL        T                      N      Seattle
## 6          www.livenation.com        T                      N  Los Angeles
##   web_session_cookie_val gndr_cd age_yr income_amt edu_val
## 1   7dfa56dd7d5956b17587    <NA>   <NA>       <NA>    <NA>
## 2   4f9e6fc637eaf7b736c2    <NA>   <NA>       <NA>    <NA>
## 3   6c2545703bd527a7144d    <NA>   <NA>       <NA>    <NA>
## 4   527d6b1eaffc69ddd882    <NA>   <NA>       <NA>    <NA>
## 5   8bd62c394a35213bdf52    <NA>   <NA>       <NA>    <NA>
## 6   3b3a628f83135acd0676    <NA>   <NA>       <NA>    <NA>
##   edu_1st_indv_val edu_2nd_indv_val adults_in_hh_num married_ind
## 1             <NA>             <NA>             <NA>        <NA>
## 2             <NA>             <NA>             <NA>        <NA>
## 3             <NA>             <NA>             <NA>        <NA>
## 4             <NA>             <NA>             <NA>        <NA>
## 5             <NA>             <NA>             <NA>        <NA>
## 6             <NA>             <NA>             <NA>        <NA>
##   child_present_ind home_owner_ind occpn_val occpn_1st_val occpn_2nd_val
## 1              <NA>           <NA>      <NA>          <NA>          <NA>
## 2              <NA>           <NA>      <NA>          <NA>          <NA>
## 3              <NA>           <NA>      <NA>          <NA>          <NA>
## 4              <NA>           <NA>      <NA>          <NA>          <NA>
## 5              <NA>           <NA>      <NA>          <NA>          <NA>
## 6              <NA>           <NA>      <NA>          <NA>          <NA>
##   dist_to_ven
## 1          NA
## 2          59
## 3          NA
## 4          NA
## 5          NA
## 6          NA
# View column names of sales
names(sales)
##  [1] "X"                      "event_id"              
##  [3] "primary_act_id"         "secondary_act_id"      
##  [5] "purch_party_lkup_id"    "event_name"            
##  [7] "primary_act_name"       "secondary_act_name"    
##  [9] "major_cat_name"         "minor_cat_name"        
## [11] "la_event_type_cat"      "event_disp_name"       
## [13] "ticket_text"            "tickets_purchased_qty" 
## [15] "trans_face_val_amt"     "delivery_type_cd"      
## [17] "event_date_time"        "event_dt"              
## [19] "presale_dt"             "onsale_dt"             
## [21] "sales_ord_create_dttm"  "sales_ord_tran_dt"     
## [23] "print_dt"               "timezn_nm"             
## [25] "venue_city"             "venue_state"           
## [27] "venue_postal_cd_sgmt_1" "sales_platform_cd"     
## [29] "print_flg"              "la_valid_tkt_event_flg"
## [31] "fin_mkt_nm"             "web_session_cookie_val"
## [33] "gndr_cd"                "age_yr"                
## [35] "income_amt"             "edu_val"               
## [37] "edu_1st_indv_val"       "edu_2nd_indv_val"      
## [39] "adults_in_hh_num"       "married_ind"           
## [41] "child_present_ind"      "home_owner_ind"        
## [43] "occpn_val"              "occpn_1st_val"         
## [45] "occpn_2nd_val"          "dist_to_ven"

Summarizing the data

# Look at structure of sales
str(sales)
## 'data.frame':    5000 obs. of  46 variables:
##  $ X                     : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ event_id              : chr  "abcaf1adb99a935fc661" "6c56d7f08c95f2aa453c" "c7ab4524a121f9d687d2" "394cb493f893be9b9ed1" ...
##  $ primary_act_id        : chr  "43f0436b905bfa7c2eec" "1a3e9aecd0617706a794" "4b677c3f5bec71eec8d1" "b1ccea01ad6ef8522796" ...
##  $ secondary_act_id      : chr  "b85143bf51323b72e53c" "f53529c5679ea6ca5a48" "b85143bf51323b72e53c" "b85143bf51323b72e53c" ...
##  $ purch_party_lkup_id   : chr  "7dfa56dd7d5956b17587" "4f9e6fc637eaf7b736c2" "6c2545703bd527a7144d" "527d6b1eaffc69ddd882" ...
##  $ event_name            : chr  "Xfinity Center Mansfield Premier Parking: Florida Georgia Line" "Gorge Camping - dave matthews band - sept 3-7" "Dodge Theatre Adams Street Parking - benise" "Gexa Energy Pavilion Vip Parking : kid rock with sheryl crow" ...
##  $ primary_act_name      : chr  "XFINITY Center Mansfield Premier Parking" "Gorge Camping" "Parking Event" "Gexa Energy Pavilion VIP Parking" ...
##  $ secondary_act_name    : chr  "NULL" "Dave Matthews Band" "NULL" "NULL" ...
##  $ major_cat_name        : chr  "MISC" "MISC" "MISC" "MISC" ...
##  $ minor_cat_name        : chr  "PARKING" "CAMPING" "PARKING" "PARKING" ...
##  $ la_event_type_cat     : chr  "PARKING" "INVALID" "PARKING" "PARKING" ...
##  $ event_disp_name       : chr  "Xfinity Center Mansfield Premier Parking: Florida Georgia Line" "Gorge Camping - dave matthews band - sept 3-7" "Dodge Theatre Adams Street Parking - benise" "Gexa Energy Pavilion Vip Parking : kid rock with sheryl crow" ...
##  $ ticket_text           : chr  "   THIS TICKET IS VALID        FOR PARKING ONLY         GOOD THIS DAY ONLY       PREMIER PARKING PASS    XFINIT"| __truncated__ "%OVERNIGHT C A M P I N G%* * * * * *%GORGE CAMPGROUND%* GOOD THIS DATE ONLY *%SEP 3 - 6, 2009" "ADAMS STREET GARAGE%PARKING FOR 4/21/06 ONLY%DODGE THEATRE PARKING PASS%ENTRANCE ON ADAMS STREET%BENISE%GARAGE OPENS AT 6:00PM" "   THIS TICKET IS VALID        FOR PARKING ONLY      GOOD FOR THIS DATE ONLY       VIP PARKING PASS        GEXA"| __truncated__ ...
##  $ tickets_purchased_qty : int  1 1 1 1 1 2 1 1 1 1 ...
##  $ trans_face_val_amt    : num  45 75 5 20 20 10 30 28 20 25 ...
##  $ delivery_type_cd      : chr  "eTicket" "TicketFast" "TicketFast" "Mail" ...
##  $ event_date_time       : chr  "2015-09-12 23:30:00" "2009-09-05 01:00:00" "2006-04-22 01:30:00" "2011-09-03 00:00:00" ...
##  $ event_dt              : chr  "2015-09-12" "2009-09-04" "2006-04-21" "2011-09-02" ...
##  $ presale_dt            : chr  "NULL" "NULL" "NULL" "NULL" ...
##  $ onsale_dt             : chr  "2015-05-15" "2009-03-13" "2006-02-25" "2011-04-22" ...
##  $ sales_ord_create_dttm : chr  "2015-09-11 18:17:45" "2009-07-06 00:00:00" "2006-04-05 00:00:00" "2011-07-01 17:38:50" ...
##  $ sales_ord_tran_dt     : chr  "2015-09-11" "2009-07-05" "2006-04-05" "2011-07-01" ...
##  $ print_dt              : chr  "2015-09-12" "2009-09-01" "2006-04-05" "2011-07-06" ...
##  $ timezn_nm             : chr  "EST" "PST" "MST" "CST" ...
##  $ venue_city            : chr  "MANSFIELD" "QUINCY" "PHOENIX" "DALLAS" ...
##  $ venue_state           : chr  "MASSACHUSETTS" "WASHINGTON" "ARIZONA" "TEXAS" ...
##  $ venue_postal_cd_sgmt_1: chr  "02048" "98848" "85003" "75210" ...
##  $ sales_platform_cd     : chr  "www.concerts.livenation.com" "NULL" "NULL" "NULL" ...
##  $ print_flg             : chr  "T " "T " "T " "T " ...
##  $ la_valid_tkt_event_flg: chr  "N " "N " "N " "N " ...
##  $ fin_mkt_nm            : chr  "Boston" "Seattle" "Arizona" "Dallas" ...
##  $ web_session_cookie_val: chr  "7dfa56dd7d5956b17587" "4f9e6fc637eaf7b736c2" "6c2545703bd527a7144d" "527d6b1eaffc69ddd882" ...
##  $ gndr_cd               : chr  NA NA NA NA ...
##  $ age_yr                : chr  NA NA NA NA ...
##  $ income_amt            : chr  NA NA NA NA ...
##  $ edu_val               : chr  NA NA NA NA ...
##  $ edu_1st_indv_val      : chr  NA NA NA NA ...
##  $ edu_2nd_indv_val      : chr  NA NA NA NA ...
##  $ adults_in_hh_num      : chr  NA NA NA NA ...
##  $ married_ind           : chr  NA NA NA NA ...
##  $ child_present_ind     : chr  NA NA NA NA ...
##  $ home_owner_ind        : chr  NA NA NA NA ...
##  $ occpn_val             : chr  NA NA NA NA ...
##  $ occpn_1st_val         : chr  NA NA NA NA ...
##  $ occpn_2nd_val         : chr  NA NA NA NA ...
##  $ dist_to_ven           : int  NA 59 NA NA NA NA NA NA NA NA ...
# View a summary of sales
summary(sales)
##        X          event_id         primary_act_id     secondary_act_id  
##  Min.   :   1   Length:5000        Length:5000        Length:5000       
##  1st Qu.:1251   Class :character   Class :character   Class :character  
##  Median :2500   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :2500                                                           
##  3rd Qu.:3750                                                           
##  Max.   :5000                                                           
##                                                                         
##  purch_party_lkup_id  event_name        primary_act_name  
##  Length:5000         Length:5000        Length:5000       
##  Class :character    Class :character   Class :character  
##  Mode  :character    Mode  :character   Mode  :character  
##                                                           
##                                                           
##                                                           
##                                                           
##  secondary_act_name major_cat_name     minor_cat_name    
##  Length:5000        Length:5000        Length:5000       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  la_event_type_cat  event_disp_name    ticket_text       
##  Length:5000        Length:5000        Length:5000       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  tickets_purchased_qty trans_face_val_amt delivery_type_cd  
##  Min.   :1.000         Min.   :   1.00    Length:5000       
##  1st Qu.:1.000         1st Qu.:  20.00    Class :character  
##  Median :1.000         Median :  30.00    Mode  :character  
##  Mean   :1.639         Mean   :  77.08                      
##  3rd Qu.:2.000         3rd Qu.:  85.00                      
##  Max.   :8.000         Max.   :1520.88                      
##                                                             
##  event_date_time      event_dt          presale_dt       
##  Length:5000        Length:5000        Length:5000       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##   onsale_dt         sales_ord_create_dttm sales_ord_tran_dt 
##  Length:5000        Length:5000           Length:5000       
##  Class :character   Class :character      Class :character  
##  Mode  :character   Mode  :character      Mode  :character  
##                                                             
##                                                             
##                                                             
##                                                             
##    print_dt          timezn_nm          venue_city       
##  Length:5000        Length:5000        Length:5000       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  venue_state        venue_postal_cd_sgmt_1 sales_platform_cd 
##  Length:5000        Length:5000            Length:5000       
##  Class :character   Class :character       Class :character  
##  Mode  :character   Mode  :character       Mode  :character  
##                                                              
##                                                              
##                                                              
##                                                              
##   print_flg         la_valid_tkt_event_flg  fin_mkt_nm       
##  Length:5000        Length:5000            Length:5000       
##  Class :character   Class :character       Class :character  
##  Mode  :character   Mode  :character       Mode  :character  
##                                                              
##                                                              
##                                                              
##                                                              
##  web_session_cookie_val   gndr_cd             age_yr         
##  Length:5000            Length:5000        Length:5000       
##  Class :character       Class :character   Class :character  
##  Mode  :character       Mode  :character   Mode  :character  
##                                                              
##                                                              
##                                                              
##                                                              
##   income_amt          edu_val          edu_1st_indv_val  
##  Length:5000        Length:5000        Length:5000       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  edu_2nd_indv_val   adults_in_hh_num   married_ind       
##  Length:5000        Length:5000        Length:5000       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  child_present_ind  home_owner_ind      occpn_val        
##  Length:5000        Length:5000        Length:5000       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  occpn_1st_val      occpn_2nd_val       dist_to_ven    
##  Length:5000        Length:5000        Min.   :   0.0  
##  Class :character   Class :character   1st Qu.:  12.0  
##  Mode  :character   Mode  :character   Median :  26.0  
##                                        Mean   : 158.2  
##                                        3rd Qu.:  77.5  
##                                        Max.   :2548.0  
##                                        NA's   :4677
# Load dplyr
library(dplyr)

# Get a glimpse of sales
glimpse(sales)
## Observations: 5,000
## Variables: 46
## $ X                      <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, ...
## $ event_id               <chr> "abcaf1adb99a935fc661", "6c56d7f08c95f2...
## $ primary_act_id         <chr> "43f0436b905bfa7c2eec", "1a3e9aecd06177...
## $ secondary_act_id       <chr> "b85143bf51323b72e53c", "f53529c5679ea6...
## $ purch_party_lkup_id    <chr> "7dfa56dd7d5956b17587", "4f9e6fc637eaf7...
## $ event_name             <chr> "Xfinity Center Mansfield Premier Parki...
## $ primary_act_name       <chr> "XFINITY Center Mansfield Premier Parki...
## $ secondary_act_name     <chr> "NULL", "Dave Matthews Band", "NULL", "...
## $ major_cat_name         <chr> "MISC", "MISC", "MISC", "MISC", "MISC",...
## $ minor_cat_name         <chr> "PARKING", "CAMPING", "PARKING", "PARKI...
## $ la_event_type_cat      <chr> "PARKING", "INVALID", "PARKING", "PARKI...
## $ event_disp_name        <chr> "Xfinity Center Mansfield Premier Parki...
## $ ticket_text            <chr> "   THIS TICKET IS VALID        FOR PAR...
## $ tickets_purchased_qty  <int> 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 4, ...
## $ trans_face_val_amt     <dbl> 45, 75, 5, 20, 20, 10, 30, 28, 20, 25, ...
## $ delivery_type_cd       <chr> "eTicket", "TicketFast", "TicketFast", ...
## $ event_date_time        <chr> "2015-09-12 23:30:00", "2009-09-05 01:0...
## $ event_dt               <chr> "2015-09-12", "2009-09-04", "2006-04-21...
## $ presale_dt             <chr> "NULL", "NULL", "NULL", "NULL", "2005-0...
## $ onsale_dt              <chr> "2015-05-15", "2009-03-13", "2006-02-25...
## $ sales_ord_create_dttm  <chr> "2015-09-11 18:17:45", "2009-07-06 00:0...
## $ sales_ord_tran_dt      <chr> "2015-09-11", "2009-07-05", "2006-04-05...
## $ print_dt               <chr> "2015-09-12", "2009-09-01", "2006-04-05...
## $ timezn_nm              <chr> "EST", "PST", "MST", "CST", "PST", "PST...
## $ venue_city             <chr> "MANSFIELD", "QUINCY", "PHOENIX", "DALL...
## $ venue_state            <chr> "MASSACHUSETTS", "WASHINGTON", "ARIZONA...
## $ venue_postal_cd_sgmt_1 <chr> "02048", "98848", "85003", "75210", "98...
## $ sales_platform_cd      <chr> "www.concerts.livenation.com", "NULL", ...
## $ print_flg              <chr> "T ", "T ", "T ", "T ", "T ", "T ", "T ...
## $ la_valid_tkt_event_flg <chr> "N ", "N ", "N ", "N ", "N ", "N ", "N ...
## $ fin_mkt_nm             <chr> "Boston", "Seattle", "Arizona", "Dallas...
## $ web_session_cookie_val <chr> "7dfa56dd7d5956b17587", "4f9e6fc637eaf7...
## $ gndr_cd                <chr> NA, NA, NA, NA, NA, NA, "M", NA, NA, NA...
## $ age_yr                 <chr> NA, NA, NA, NA, NA, NA, "28", NA, NA, N...
## $ income_amt             <chr> NA, NA, NA, NA, NA, NA, "112500", NA, N...
## $ edu_val                <chr> NA, NA, NA, NA, NA, NA, "High School", ...
## $ edu_1st_indv_val       <chr> NA, NA, NA, NA, NA, NA, "High School", ...
## $ edu_2nd_indv_val       <chr> NA, NA, NA, NA, NA, NA, "NULL", NA, NA,...
## $ adults_in_hh_num       <chr> NA, NA, NA, NA, NA, NA, "4", NA, NA, NA...
## $ married_ind            <chr> NA, NA, NA, NA, NA, NA, "0", NA, NA, NA...
## $ child_present_ind      <chr> NA, NA, NA, NA, NA, NA, "1", NA, NA, NA...
## $ home_owner_ind         <chr> NA, NA, NA, NA, NA, NA, "0", NA, NA, NA...
## $ occpn_val              <chr> NA, NA, NA, NA, NA, NA, "NULL", NA, NA,...
## $ occpn_1st_val          <chr> NA, NA, NA, NA, NA, NA, "Craftsman Blue...
## $ occpn_2nd_val          <chr> NA, NA, NA, NA, NA, NA, "NULL", NA, NA,...
## $ dist_to_ven            <int> NA, 59, NA, NA, NA, NA, NA, NA, NA, NA,...

Removing redundant info

# Remove the first column of sales: sales2
sales2 <- sales[,-1]
str(sales[,1:5])
## 'data.frame':    5000 obs. of  5 variables:
##  $ X                  : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ event_id           : chr  "abcaf1adb99a935fc661" "6c56d7f08c95f2aa453c" "c7ab4524a121f9d687d2" "394cb493f893be9b9ed1" ...
##  $ primary_act_id     : chr  "43f0436b905bfa7c2eec" "1a3e9aecd0617706a794" "4b677c3f5bec71eec8d1" "b1ccea01ad6ef8522796" ...
##  $ secondary_act_id   : chr  "b85143bf51323b72e53c" "f53529c5679ea6ca5a48" "b85143bf51323b72e53c" "b85143bf51323b72e53c" ...
##  $ purch_party_lkup_id: chr  "7dfa56dd7d5956b17587" "4f9e6fc637eaf7b736c2" "6c2545703bd527a7144d" "527d6b1eaffc69ddd882" ...
str(sales2[,1:5])
## 'data.frame':    5000 obs. of  5 variables:
##  $ event_id           : chr  "abcaf1adb99a935fc661" "6c56d7f08c95f2aa453c" "c7ab4524a121f9d687d2" "394cb493f893be9b9ed1" ...
##  $ primary_act_id     : chr  "43f0436b905bfa7c2eec" "1a3e9aecd0617706a794" "4b677c3f5bec71eec8d1" "b1ccea01ad6ef8522796" ...
##  $ secondary_act_id   : chr  "b85143bf51323b72e53c" "f53529c5679ea6ca5a48" "b85143bf51323b72e53c" "b85143bf51323b72e53c" ...
##  $ purch_party_lkup_id: chr  "7dfa56dd7d5956b17587" "4f9e6fc637eaf7b736c2" "6c2545703bd527a7144d" "527d6b1eaffc69ddd882" ...
##  $ event_name         : chr  "Xfinity Center Mansfield Premier Parking: Florida Georgia Line" "Gorge Camping - dave matthews band - sept 3-7" "Dodge Theatre Adams Street Parking - benise" "Gexa Energy Pavilion Vip Parking : kid rock with sheryl crow" ...

Information not worth keeping

# Define a vector of column indices: keep
## We don't want the first 4 coumns or the last 15
keep <- seq(5,ncol(sales2)-15,1)

# Subset sales2 using keep: sales3
sales3 <- sales2[,keep]
glimpse(sales3)
## Observations: 5,000
## Variables: 26
## $ event_name             <chr> "Xfinity Center Mansfield Premier Parki...
## $ primary_act_name       <chr> "XFINITY Center Mansfield Premier Parki...
## $ secondary_act_name     <chr> "NULL", "Dave Matthews Band", "NULL", "...
## $ major_cat_name         <chr> "MISC", "MISC", "MISC", "MISC", "MISC",...
## $ minor_cat_name         <chr> "PARKING", "CAMPING", "PARKING", "PARKI...
## $ la_event_type_cat      <chr> "PARKING", "INVALID", "PARKING", "PARKI...
## $ event_disp_name        <chr> "Xfinity Center Mansfield Premier Parki...
## $ ticket_text            <chr> "   THIS TICKET IS VALID        FOR PAR...
## $ tickets_purchased_qty  <int> 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 4, ...
## $ trans_face_val_amt     <dbl> 45, 75, 5, 20, 20, 10, 30, 28, 20, 25, ...
## $ delivery_type_cd       <chr> "eTicket", "TicketFast", "TicketFast", ...
## $ event_date_time        <chr> "2015-09-12 23:30:00", "2009-09-05 01:0...
## $ event_dt               <chr> "2015-09-12", "2009-09-04", "2006-04-21...
## $ presale_dt             <chr> "NULL", "NULL", "NULL", "NULL", "2005-0...
## $ onsale_dt              <chr> "2015-05-15", "2009-03-13", "2006-02-25...
## $ sales_ord_create_dttm  <chr> "2015-09-11 18:17:45", "2009-07-06 00:0...
## $ sales_ord_tran_dt      <chr> "2015-09-11", "2009-07-05", "2006-04-05...
## $ print_dt               <chr> "2015-09-12", "2009-09-01", "2006-04-05...
## $ timezn_nm              <chr> "EST", "PST", "MST", "CST", "PST", "PST...
## $ venue_city             <chr> "MANSFIELD", "QUINCY", "PHOENIX", "DALL...
## $ venue_state            <chr> "MASSACHUSETTS", "WASHINGTON", "ARIZONA...
## $ venue_postal_cd_sgmt_1 <chr> "02048", "98848", "85003", "75210", "98...
## $ sales_platform_cd      <chr> "www.concerts.livenation.com", "NULL", ...
## $ print_flg              <chr> "T ", "T ", "T ", "T ", "T ", "T ", "T ...
## $ la_valid_tkt_event_flg <chr> "N ", "N ", "N ", "N ", "N ", "N ", "N ...
## $ fin_mkt_nm             <chr> "Boston", "Seattle", "Arizona", "Dallas...

Separating columns

# Load tidyr
library(tidyr)

# Split event_date_time: sales4
head(sales3$event_date_time)
## [1] "2015-09-12 23:30:00" "2009-09-05 01:00:00" "2006-04-22 01:30:00"
## [4] "2011-09-03 00:00:00" "2005-07-31 01:00:00" "2012-07-22 02:00:00"
sales4 <- separate(sales3, event_date_time, c('event_dt', 'event_time'), sep = " ")

## check new columns
col <- str_detect(names(sales4),'event')
glimpse(sales4[,col])
## Observations: 5,000
## Variables: 7
## $ event_name             <chr> "Xfinity Center Mansfield Premier Parki...
## $ la_event_type_cat      <chr> "PARKING", "INVALID", "PARKING", "PARKI...
## $ event_disp_name        <chr> "Xfinity Center Mansfield Premier Parki...
## $ event_dt               <chr> "2015-09-12", "2009-09-05", "2006-04-22...
## $ event_time             <chr> "23:30:00", "01:00:00", "01:30:00", "00...
## $ event_dt.1             <chr> "2015-09-12", "2009-09-04", "2006-04-21...
## $ la_valid_tkt_event_flg <chr> "N ", "N ", "N ", "N ", "N ", "N ", "N ...
# Split sales_ord_create_dttm: sales5
head(sales4$sales_ord_create_dttm)
## [1] "2015-09-11 18:17:45" "2009-07-06 00:00:00" "2006-04-05 00:00:00"
## [4] "2011-07-01 17:38:50" "2005-06-18 00:00:00" "2012-07-21 17:20:18"
sales5 <- separate(sales4, sales_ord_create_dttm, c('ord_create_dt', 'ord_create_time'), sep = " ")

## check new columns
col <- str_detect(names(sales5),'ord_create')
glimpse(sales5[,col])
## Observations: 5,000
## Variables: 2
## $ ord_create_dt   <chr> "2015-09-11", "2009-07-06", "2006-04-05", "201...
## $ ord_create_time <chr> "18:17:45", "00:00:00", "00:00:00", "17:38:50"...

Dealing with warnings

  • When running the separate in the last code block the following warning is printed to the console
    • Warning message: Too few values at 4 locations: 2516, 3863, 4082, 4183
    • This does not show in the html doc becasue I have warnings=F set on all code chunks
    • I do this so warnings don’t make the final doc messy, but its still a good idea to check on them as they do in the next couple exercises
# Define an issues vector
issues <- c(2516, 3863, 4082, 4183)

# Print values of sales_ord_create_dttm at these indices
sales3$sales_ord_create_dttm[issues]
## [1] "NULL" "NULL" "NULL" "NULL"
# Print a well-behaved value of sales_ord_create_dttm
sales3$sales_ord_create_dttm[2517]
## [1] "2013-08-04 23:07:19"
  • This just shows that the values in these rows indicated in the warning are NULL.
  • So, it makes sense that there would be a warnging, but there is nothing we can do about it.

Identifying dates

# Load stringr
library(stringr)

# Find columns of sales5 containing "dt": date_cols
date_cols <- str_detect(colnames(sales5), "dt")
glimpse(sales5[,date_cols])
## Observations: 5,000
## Variables: 7
## $ event_dt          <chr> "2015-09-12", "2009-09-05", "2006-04-22", "2...
## $ event_dt.1        <chr> "2015-09-12", "2009-09-04", "2006-04-21", "2...
## $ presale_dt        <chr> "NULL", "NULL", "NULL", "NULL", "2005-03-02"...
## $ onsale_dt         <chr> "2015-05-15", "2009-03-13", "2006-02-25", "2...
## $ ord_create_dt     <chr> "2015-09-11", "2009-07-06", "2006-04-05", "2...
## $ sales_ord_tran_dt <chr> "2015-09-11", "2009-07-05", "2006-04-05", "2...
## $ print_dt          <chr> "2015-09-12", "2009-09-01", "2006-04-05", "2...
# Load lubridate
library(lubridate)

# Coerce date columns into Date objects
sales5[, date_cols] <- lapply(sales5[, date_cols], ymd)

# Check column types
glimpse(sales5[,date_cols])
## Observations: 5,000
## Variables: 7
## $ event_dt          <date> 2015-09-12, 2009-09-05, 2006-04-22, 2011-09...
## $ event_dt.1        <date> 2015-09-12, 2009-09-04, 2006-04-21, 2011-09...
## $ presale_dt        <date> NA, NA, NA, NA, 2005-03-02, NA, NA, NA, NA,...
## $ onsale_dt         <date> 2015-05-15, 2009-03-13, 2006-02-25, 2011-04...
## $ ord_create_dt     <date> 2015-09-11, 2009-07-06, 2006-04-05, 2011-07...
## $ sales_ord_tran_dt <date> 2015-09-11, 2009-07-05, 2006-04-05, 2011-07...
## $ print_dt          <date> 2015-09-12, 2009-09-01, 2006-04-05, 2011-07...

More warnings!

  • Aight, so when running the ymd function in the lapply above I get the following warnings in the console
    • Warning message: 2892 failed to parse.
    • Warning message: 101 failed to parse.
    • Warning message: 4 failed to parse.
    • Warning message: 424 failed to parse.
  • These warnings are saying there were values that could not be converted to a date.
    • We can check to see if these just match up to the number of values that are NA
    • We lapply the ia.na function on all date columns
    • Then count the number of TRUEs
    • And we see that the counts match the warnings exactly.
## stringr is loaded

# Find date columns (don't change)
date_cols <- str_detect(names(sales5), "dt")

# Create logical vectors indicating missing values (don't change)
missing <- lapply(sales5[, date_cols], is.na)

# Create a numerical vector that counts missing values: num_missing
num_missing <- sapply(missing, sum)

# Print num_missing
num_missing
##          event_dt        event_dt.1        presale_dt         onsale_dt 
##                 0                 0              2892               101 
##     ord_create_dt sales_ord_tran_dt          print_dt 
##                 4                 0               424

Combining columns

## tidyr is loaded

# Combine the venue_city and venue_state columns
sales6 <- unite(sales5, venue_city_state, venue_city, venue_state, sep = ", ")

# View the head of sales6
head(sales6)
##                                                       event_name
## 1 Xfinity Center Mansfield Premier Parking: Florida Georgia Line
## 2                  Gorge Camping - dave matthews band - sept 3-7
## 3                    Dodge Theatre Adams Street Parking - benise
## 4   Gexa Energy Pavilion Vip Parking : kid rock with sheryl crow
## 5                                  Premier Parking - motley crue
## 6                                      Fast Lane Access: Journey
##                           primary_act_name secondary_act_name
## 1 XFINITY Center Mansfield Premier Parking               NULL
## 2                            Gorge Camping Dave Matthews Band
## 3                            Parking Event               NULL
## 4         Gexa Energy Pavilion VIP Parking               NULL
## 5 White River Amphitheatre Premier Parking               NULL
## 6                         Fast Lane Access            Journey
##   major_cat_name         minor_cat_name la_event_type_cat
## 1           MISC                PARKING           PARKING
## 2           MISC                CAMPING           INVALID
## 3           MISC                PARKING           PARKING
## 4           MISC                PARKING           PARKING
## 5           MISC                PARKING           PARKING
## 6           MISC SPECIAL ENTRY (UPSELL)            UPSELL
##                                                  event_disp_name
## 1 Xfinity Center Mansfield Premier Parking: Florida Georgia Line
## 2                  Gorge Camping - dave matthews band - sept 3-7
## 3                    Dodge Theatre Adams Street Parking - benise
## 4   Gexa Energy Pavilion Vip Parking : kid rock with sheryl crow
## 5                                  Premier Parking - motley crue
## 6                                      Fast Lane Access: Journey
##                                                                                                                                                    ticket_text
## 1    THIS TICKET IS VALID        FOR PARKING ONLY         GOOD THIS DAY ONLY       PREMIER PARKING PASS    XFINITY CENTER,LOTS 4 PM  SAT SEP 12 2015 7:30 PM  
## 2                                                                %OVERNIGHT C A M P I N G%* * * * * *%GORGE CAMPGROUND%* GOOD THIS DATE ONLY *%SEP 3 - 6, 2009
## 3                               ADAMS STREET GARAGE%PARKING FOR 4/21/06 ONLY%DODGE THEATRE PARKING PASS%ENTRANCE ON ADAMS STREET%BENISE%GARAGE OPENS AT 6:00PM
## 4    THIS TICKET IS VALID        FOR PARKING ONLY      GOOD FOR THIS DATE ONLY       VIP PARKING PASS        GEXA ENERGY PAVILION    FRI SEP 02 2011 7:00 PM  
## 5                              THIS TICKET IS VALID%FOR PARKING ONLY%GOOD THIS DATE ONLY%PREMIER PARKING PASS%WHITE RIVER AMPHITHEATRE%SAT JUL 30, 2005 6:00PM
## 6         FAST LANE                  JOURNEY               FAST LANE EVENT         THIS IS NOT A TICKET    SAN MANUEL AMPHITHEATER   SAT JUL 21 2012 7:00 PM  
##   tickets_purchased_qty trans_face_val_amt delivery_type_cd   event_dt
## 1                     1                 45          eTicket 2015-09-12
## 2                     1                 75       TicketFast 2009-09-05
## 3                     1                  5       TicketFast 2006-04-22
## 4                     1                 20             Mail 2011-09-03
## 5                     1                 20             Mail 2005-07-31
## 6                     2                 10       TicketFast 2012-07-22
##   event_time event_dt.1 presale_dt  onsale_dt ord_create_dt
## 1   23:30:00 2015-09-12       <NA> 2015-05-15    2015-09-11
## 2   01:00:00 2009-09-04       <NA> 2009-03-13    2009-07-06
## 3   01:30:00 2006-04-21       <NA> 2006-02-25    2006-04-05
## 4   00:00:00 2011-09-02       <NA> 2011-04-22    2011-07-01
## 5   01:00:00 2005-07-30 2005-03-02 2005-03-04    2005-06-18
## 6   02:00:00 2012-07-21       <NA> 2012-04-11    2012-07-21
##   ord_create_time sales_ord_tran_dt   print_dt timezn_nm
## 1        18:17:45        2015-09-11 2015-09-12       EST
## 2        00:00:00        2009-07-05 2009-09-01       PST
## 3        00:00:00        2006-04-05 2006-04-05       MST
## 4        17:38:50        2011-07-01 2011-07-06       CST
## 5        00:00:00        2005-06-18 2005-06-28       PST
## 6        17:20:18        2012-07-21 2012-07-21       PST
##             venue_city_state venue_postal_cd_sgmt_1
## 1   MANSFIELD, MASSACHUSETTS                  02048
## 2         QUINCY, WASHINGTON                  98848
## 3           PHOENIX, ARIZONA                  85003
## 4              DALLAS, TEXAS                  75210
## 5         AUBURN, WASHINGTON                  98092
## 6 SAN BERNARDINO, CALIFORNIA                  92407
##             sales_platform_cd print_flg la_valid_tkt_event_flg  fin_mkt_nm
## 1 www.concerts.livenation.com        T                      N       Boston
## 2                        NULL        T                      N      Seattle
## 3                        NULL        T                      N      Arizona
## 4                        NULL        T                      N       Dallas
## 5                        NULL        T                      N      Seattle
## 6          www.livenation.com        T                      N  Los Angeles
head(sales6$venue_city_state)
## [1] "MANSFIELD, MASSACHUSETTS"   "QUINCY, WASHINGTON"        
## [3] "PHOENIX, ARIZONA"           "DALLAS, TEXAS"             
## [5] "AUBURN, WASHINGTON"         "SAN BERNARDINO, CALIFORNIA"

   


MBTA Ridership Data


Using readxl

  • I need to pull this dataset down from aws then load
# Load readxl
library(readxl)

# Import mbta.xlsx and skip first row: mbta
url_mbta <- 'http://s3.amazonaws.com/assets.datacamp.com/production/course_1294/datasets/mbta.xlsx'
download.file(url_mbta, 'data/mbta.xlsx')
mbta <- read_excel('data/mbta.xlsx', skip=1)

Examining the data

# View the structure of mbta
str(mbta)
## Classes 'tbl_df', 'tbl' and 'data.frame':    11 obs. of  60 variables:
##  $ X__1   : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ mode   : chr  "All Modes by Qtr" "Boat" "Bus" "Commuter Rail" ...
##  $ 2007-01: chr  "NA" "4" "335.819" "142.2" ...
##  $ 2007-02: chr  "NA" "3.6" "338.675" "138.5" ...
##  $ 2007-03: num  1188 40 340 138 459 ...
##  $ 2007-04: chr  "NA" "4.3" "352.162" "139.5" ...
##  $ 2007-05: chr  "NA" "4.9" "354.367" "139" ...
##  $ 2007-06: num  1246 5.8 350.5 143 477 ...
##  $ 2007-07: chr  "NA" "6.521" "357.519" "142.391" ...
##  $ 2007-08: chr  "NA" "6.572" "355.479" "142.364" ...
##  $ 2007-09: num  1256.57 5.47 372.6 143.05 499.57 ...
##  $ 2007-10: chr  "NA" "5.145" "368.847" "146.542" ...
##  $ 2007-11: chr  "NA" "3.763" "330.826" "145.089" ...
##  $ 2007-12: num  1216.89 2.98 312.92 141.59 448.27 ...
##  $ 2008-01: chr  "NA" "3.175" "340.324" "142.145" ...
##  $ 2008-02: chr  "NA" "3.111" "352.905" "142.607" ...
##  $ 2008-03: num  1253.52 3.51 361.15 137.45 494.05 ...
##  $ 2008-04: chr  "NA" "4.164" "368.189" "140.389" ...
##  $ 2008-05: chr  "NA" "4.015" "363.903" "142.585" ...
##  $ 2008-06: num  1314.82 5.19 362.96 142.06 518.35 ...
##  $ 2008-07: chr  "NA" "6.016" "370.921" "145.731" ...
##  $ 2008-08: chr  "NA" "5.8" "361.057" "144.565" ...
##  $ 2008-09: num  1307.04 4.59 389.54 141.91 517.32 ...
##  $ 2008-10: chr  "NA" "4.285" "357.974" "151.957" ...
##  $ 2008-11: chr  "NA" "3.488" "345.423" "152.952" ...
##  $ 2008-12: num  1232.65 3.01 325.77 140.81 446.74 ...
##  $ 2009-01: chr  "NA" "3.014" "338.532" "141.448" ...
##  $ 2009-02: chr  "NA" "3.196" "360.412" "143.529" ...
##  $ 2009-03: num  1209.79 3.33 353.69 142.89 467.22 ...
##  $ 2009-04: chr  "NA" "4.049" "359.38" "142.34" ...
##  $ 2009-05: chr  "NA" "4.119" "354.75" "144.225" ...
##  $ 2009-06: num  1233.1 4.9 347.9 142 473.1 ...
##  $ 2009-07: chr  "NA" "6.444" "339.477" "137.691" ...
##  $ 2009-08: chr  "NA" "5.903" "332.661" "139.158" ...
##  $ 2009-09: num  1230.5 4.7 374.3 139.1 500.4 ...
##  $ 2009-10: chr  "NA" "4.212" "385.868" "137.104" ...
##  $ 2009-11: chr  "NA" "3.576" "366.98" "129.343" ...
##  $ 2009-12: num  1207.85 3.11 332.39 126.07 440.93 ...
##  $ 2010-01: chr  "NA" "3.207" "362.226" "130.91" ...
##  $ 2010-02: chr  "NA" "3.195" "361.138" "131.918" ...
##  $ 2010-03: num  1208.86 3.48 373.44 131.25 483.4 ...
##  $ 2010-04: chr  "NA" "4.452" "378.611" "131.722" ...
##  $ 2010-05: chr  "NA" "4.415" "380.171" "128.8" ...
##  $ 2010-06: num  1244.41 5.41 363.27 129.14 490.26 ...
##  $ 2010-07: chr  "NA" "6.513" "353.04" "122.935" ...
##  $ 2010-08: chr  "NA" "6.269" "343.688" "129.732" ...
##  $ 2010-09: num  1225.5 4.7 381.6 132.9 521.1 ...
##  $ 2010-10: chr  "NA" "4.402" "384.987" "131.033" ...
##  $ 2010-11: chr  "NA" "3.731" "367.955" "130.889" ...
##  $ 2010-12: num  1216.26 3.16 326.34 121.42 450.43 ...
##  $ 2011-01: chr  "NA" "3.14" "334.958" "128.396" ...
##  $ 2011-02: chr  "NA" "3.284" "346.234" "125.463" ...
##  $ 2011-03: num  1223.45 3.67 380.4 134.37 516.73 ...
##  $ 2011-04: chr  "NA" "4.251" "380.446" "134.169" ...
##  $ 2011-05: chr  "NA" "4.431" "385.289" "136.14" ...
##  $ 2011-06: num  1302.41 5.47 376.32 135.58 529.53 ...
##  $ 2011-07: chr  "NA" "6.581" "361.585" "132.41" ...
##  $ 2011-08: chr  "NA" "6.733" "353.793" "130.616" ...
##  $ 2011-09: num  1291 5 388 137 550 ...
##  $ 2011-10: chr  "NA" "4.484" "398.456" "128.72" ...
# View the first 6 rows of mbta
head(mbta)
## # A tibble: 6 x 60
##    X__1             mode `2007-01` `2007-02` `2007-03` `2007-04` `2007-05`
##   <dbl>            <chr>     <chr>     <chr>     <dbl>     <chr>     <chr>
## 1     1 All Modes by Qtr        NA        NA  1187.653        NA        NA
## 2     2             Boat         4       3.6    40.000       4.3       4.9
## 3     3              Bus   335.819   338.675   339.867   352.162   354.367
## 4     4    Commuter Rail     142.2     138.5   137.700     139.5       139
## 5     5       Heavy Rail   435.294   448.271   458.583   472.201   474.579
## 6     6       Light Rail   227.231   240.262   241.444   255.557   248.262
## # ... with 53 more variables: `2007-06` <dbl>, `2007-07` <chr>,
## #   `2007-08` <chr>, `2007-09` <dbl>, `2007-10` <chr>, `2007-11` <chr>,
## #   `2007-12` <dbl>, `2008-01` <chr>, `2008-02` <chr>, `2008-03` <dbl>,
## #   `2008-04` <chr>, `2008-05` <chr>, `2008-06` <dbl>, `2008-07` <chr>,
## #   `2008-08` <chr>, `2008-09` <dbl>, `2008-10` <chr>, `2008-11` <chr>,
## #   `2008-12` <dbl>, `2009-01` <chr>, `2009-02` <chr>, `2009-03` <dbl>,
## #   `2009-04` <chr>, `2009-05` <chr>, `2009-06` <dbl>, `2009-07` <chr>,
## #   `2009-08` <chr>, `2009-09` <dbl>, `2009-10` <chr>, `2009-11` <chr>,
## #   `2009-12` <dbl>, `2010-01` <chr>, `2010-02` <chr>, `2010-03` <dbl>,
## #   `2010-04` <chr>, `2010-05` <chr>, `2010-06` <dbl>, `2010-07` <chr>,
## #   `2010-08` <chr>, `2010-09` <dbl>, `2010-10` <chr>, `2010-11` <chr>,
## #   `2010-12` <dbl>, `2011-01` <chr>, `2011-02` <chr>, `2011-03` <dbl>,
## #   `2011-04` <chr>, `2011-05` <chr>, `2011-06` <dbl>, `2011-07` <chr>,
## #   `2011-08` <chr>, `2011-09` <dbl>, `2011-10` <chr>
# View a summary of mbta
summary(mbta)
##       X__1          mode             2007-01            2007-02         
##  Min.   : 1.0   Length:11          Length:11          Length:11         
##  1st Qu.: 3.5   Class :character   Class :character   Class :character  
##  Median : 6.0   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 6.0                                                           
##  3rd Qu.: 8.5                                                           
##  Max.   :11.0                                                           
##     2007-03           2007-04            2007-05         
##  Min.   :   0.114   Length:11          Length:11         
##  1st Qu.:   9.278   Class :character   Class :character  
##  Median : 137.700   Mode  :character   Mode  :character  
##  Mean   : 330.293                                        
##  3rd Qu.: 399.225                                        
##  Max.   :1204.725                                        
##     2007-06           2007-07            2007-08         
##  Min.   :   0.096   Length:11          Length:11         
##  1st Qu.:   5.700   Class :character   Class :character  
##  Median : 143.000   Mode  :character   Mode  :character  
##  Mean   : 339.846                                        
##  3rd Qu.: 413.788                                        
##  Max.   :1246.129                                        
##     2007-09           2007-10            2007-11         
##  Min.   :  -0.007   Length:11          Length:11         
##  1st Qu.:   5.539   Class :character   Class :character  
##  Median : 143.051   Mode  :character   Mode  :character  
##  Mean   : 352.554                                        
##  3rd Qu.: 436.082                                        
##  Max.   :1310.764                                        
##     2007-12           2008-01            2008-02         
##  Min.   :  -0.060   Length:11          Length:11         
##  1st Qu.:   4.385   Class :character   Class :character  
##  Median : 141.585   Mode  :character   Mode  :character  
##  Mean   : 321.588                                        
##  3rd Qu.: 380.594                                        
##  Max.   :1216.890                                        
##     2008-03           2008-04            2008-05         
##  Min.   :   0.058   Length:11          Length:11         
##  1st Qu.:   5.170   Class :character   Class :character  
##  Median : 137.453   Mode  :character   Mode  :character  
##  Mean   : 345.604                                        
##  3rd Qu.: 427.601                                        
##  Max.   :1274.031                                        
##     2008-06           2008-07            2008-08         
##  Min.   :   0.060   Length:11          Length:11         
##  1st Qu.:   5.742   Class :character   Class :character  
##  Median : 142.057   Mode  :character   Mode  :character  
##  Mean   : 359.667                                        
##  3rd Qu.: 440.656                                        
##  Max.   :1320.728                                        
##     2008-09           2008-10            2008-11         
##  Min.   :   0.021   Length:11          Length:11         
##  1st Qu.:   5.691   Class :character   Class :character  
##  Median : 141.907   Mode  :character   Mode  :character  
##  Mean   : 362.099                                        
##  3rd Qu.: 453.430                                        
##  Max.   :1338.015                                        
##     2008-12           2009-01            2009-02         
##  Min.   :  -0.015   Length:11          Length:11         
##  1st Qu.:   4.689   Class :character   Class :character  
##  Median : 140.810   Mode  :character   Mode  :character  
##  Mean   : 319.882                                        
##  3rd Qu.: 386.255                                        
##  Max.   :1232.655                                        
##     2009-03           2009-04            2009-05         
##  Min.   :  -0.050   Length:11          Length:11         
##  1st Qu.:   5.003   Class :character   Class :character  
##  Median : 142.893   Mode  :character   Mode  :character  
##  Mean   : 330.142                                        
##  3rd Qu.: 410.455                                        
##  Max.   :1210.912                                        
##     2009-06           2009-07            2009-08         
##  Min.   :  -0.079   Length:11          Length:11         
##  1st Qu.:   5.845   Class :character   Class :character  
##  Median : 142.006   Mode  :character   Mode  :character  
##  Mean   : 333.194                                        
##  3rd Qu.: 410.482                                        
##  Max.   :1233.085                                        
##     2009-09           2009-10            2009-11         
##  Min.   :  -0.035   Length:11          Length:11         
##  1st Qu.:   5.693   Class :character   Class :character  
##  Median : 139.087   Mode  :character   Mode  :character  
##  Mean   : 346.687                                        
##  3rd Qu.: 437.332                                        
##  Max.   :1291.564                                        
##     2009-12           2010-01            2010-02         
##  Min.   :  -0.022   Length:11          Length:11         
##  1st Qu.:   4.784   Class :character   Class :character  
##  Median : 126.066   Mode  :character   Mode  :character  
##  Mean   : 312.962                                        
##  3rd Qu.: 386.659                                        
##  Max.   :1207.845                                        
##     2010-03           2010-04            2010-05         
##  Min.   :   0.012   Length:11          Length:11         
##  1st Qu.:   5.274   Class :character   Class :character  
##  Median : 131.252   Mode  :character   Mode  :character  
##  Mean   : 332.726                                        
##  3rd Qu.: 428.420                                        
##  Max.   :1225.556                                        
##     2010-06           2010-07            2010-08         
##  Min.   :   0.008   Length:11          Length:11         
##  1st Qu.:   6.436   Class :character   Class :character  
##  Median : 129.144   Mode  :character   Mode  :character  
##  Mean   : 335.964                                        
##  3rd Qu.: 426.769                                        
##  Max.   :1244.409                                        
##     2010-09           2010-10            2010-11         
##  Min.   :   0.001   Length:11          Length:11         
##  1st Qu.:   5.567   Class :character   Class :character  
##  Median : 132.892   Mode  :character   Mode  :character  
##  Mean   : 346.524                                        
##  3rd Qu.: 451.361                                        
##  Max.   :1293.117                                        
##     2010-12           2011-01            2011-02         
##  Min.   :  -0.004   Length:11          Length:11         
##  1st Qu.:   4.466   Class :character   Class :character  
##  Median : 121.422   Mode  :character   Mode  :character  
##  Mean   : 312.917                                        
##  3rd Qu.: 388.385                                        
##  Max.   :1216.262                                        
##     2011-03          2011-04            2011-05         
##  Min.   :   0.05   Length:11          Length:11         
##  1st Qu.:   6.03   Class :character   Class :character  
##  Median : 134.37   Mode  :character   Mode  :character  
##  Mean   : 345.17                                        
##  3rd Qu.: 448.56                                        
##  Max.   :1286.66                                        
##     2011-06           2011-07            2011-08         
##  Min.   :   0.054   Length:11          Length:11         
##  1st Qu.:   6.926   Class :character   Class :character  
##  Median : 135.581   Mode  :character   Mode  :character  
##  Mean   : 353.331                                        
##  3rd Qu.: 452.923                                        
##  Max.   :1302.414                                        
##     2011-09           2011-10         
##  Min.   :   0.043   Length:11         
##  1st Qu.:   6.660   Class :character  
##  Median : 136.901   Mode  :character  
##  Mean   : 362.555                     
##  3rd Qu.: 469.204                     
##  Max.   :1348.754

Removing unnecessry rows and columns

  • Some of these rows are from analysis mixed in with the data in the excel sheet
    • This is super common in excel but makes the data messy and less usable
  • The first column is just the row number
# Remove rows 1, 7, and 11 of mbta: mbta2
keep <- !(mbta$mode %in% c('All Modes by Qtr', 'Pct Chg / Yr', 'TOTAL'))

mbta2 <- mbta[keep,]
glimpse(mbta2)
## Observations: 8
## Variables: 60
## $ X__1    <dbl> 2, 3, 4, 5, 6, 8, 9, 10
## $ mode    <chr> "Boat", "Bus", "Commuter Rail", "Heavy Rail", "Light R...
## $ 2007-01 <chr> "4", "335.819", "142.2", "435.294", "227.231", "4.772"...
## $ 2007-02 <chr> "3.6", "338.675", "138.5", "448.271", "240.262", "4.41...
## $ 2007-03 <dbl> 40.000, 339.867, 137.700, 458.583, 241.444, 4.574, 5.5...
## $ 2007-04 <chr> "4.3", "352.162", "139.5", "472.201", "255.557", "4.54...
## $ 2007-05 <chr> "4.9", "354.367", "139", "474.579", "248.262", "4.768"...
## $ 2007-06 <dbl> 5.800, 350.543, 143.000, 477.032, 246.108, 4.722, 5.60...
## $ 2007-07 <chr> "6.521", "357.519", "142.391", "471.735", "243.286", "...
## $ 2007-08 <chr> "6.572", "355.479", "142.364", "461.605", "234.907", "...
## $ 2007-09 <dbl> 5.469, 372.598, 143.051, 499.566, 265.748, 4.329, 5.60...
## $ 2007-10 <chr> "5.145", "368.847", "146.542", "457.741", "241.434", "...
## $ 2007-11 <chr> "3.763", "330.826", "145.089", "488.348", "250.497", "...
## $ 2007-12 <dbl> 2.985, 312.920, 141.585, 448.268, 233.379, 3.708, 5.06...
## $ 2008-01 <chr> "3.175", "340.324", "142.145", "472.624", "241.223", "...
## $ 2008-02 <chr> "3.111", "352.905", "142.607", "492.1", "249.306", "4....
## $ 2008-03 <dbl> 3.512, 361.155, 137.453, 494.046, 253.132, 4.175, 6.16...
## $ 2008-04 <chr> "4.164", "368.189", "140.389", "513.204", "271.07", "4...
## $ 2008-05 <chr> "4.015", "363.903", "142.585", "507.952", "258.351", "...
## $ 2008-06 <dbl> 5.189, 362.962, 142.057, 518.349, 266.961, 4.400, 6.29...
## $ 2008-07 <chr> "6.016", "370.921", "145.731", "512.309", "270.158", "...
## $ 2008-08 <chr> "5.8", "361.057", "144.565", "476.99", "239.344", "4.2...
## $ 2008-09 <dbl> 4.587, 389.537, 141.907, 517.324, 258.171, 4.878, 6.50...
## $ 2008-10 <chr> "4.285", "357.974", "151.957", "523.644", "250.063", "...
## $ 2008-11 <chr> "3.488", "345.423", "152.952", "487.115", "232.068", "...
## $ 2008-12 <dbl> 3.007, 325.767, 140.810, 446.743, 205.420, 3.420, 5.95...
## $ 2009-01 <chr> "3.014", "338.532", "141.448", "461.004", "215.66", "3...
## $ 2009-02 <chr> "3.196", "360.412", "143.529", "482.407", "228.737", "...
## $ 2009-03 <dbl> 3.330, 353.686, 142.893, 467.224, 222.844, 2.586, 6.67...
## $ 2009-04 <chr> "4.049", "359.38", "142.34", "493.152", "238.232", "2....
## $ 2009-05 <chr> "4.119", "354.75", "144.225", "475.634", "224.962", "2...
## $ 2009-06 <dbl> 4.900, 347.865, 142.006, 473.099, 226.259, 2.763, 6.79...
## $ 2009-07 <chr> "6.444", "339.477", "137.691", "470.828", "230.308", "...
## $ 2009-08 <chr> "5.903", "332.661", "139.158", "466.676", "231.783", "...
## $ 2009-09 <dbl> 4.696, 374.260, 139.087, 500.403, 250.922, 2.716, 6.69...
## $ 2009-10 <chr> "4.212", "385.868", "137.104", "513.406", "230.739", "...
## $ 2009-11 <chr> "3.576", "366.98", "129.343", "480.278", "214.711", "2...
## $ 2009-12 <dbl> 3.113, 332.394, 126.066, 440.925, 194.446, 2.365, 6.45...
## $ 2010-01 <chr> "3.207", "362.226", "130.91", "464.069", "204.396", "2...
## $ 2010-02 <chr> "3.195", "361.138", "131.918", "480.121", "213.136", "...
## $ 2010-03 <dbl> 3.481, 373.443, 131.252, 483.397, 211.693, 2.550, 7.06...
## $ 2010-04 <chr> "4.452", "378.611", "131.722", "502.374", "227.246", "...
## $ 2010-05 <chr> "4.415", "380.171", "128.8", "487.4", "217.805", "2.68...
## $ 2010-06 <dbl> 5.411, 363.275, 129.144, 490.263, 215.922, 2.646, 7.46...
## $ 2010-07 <chr> "6.513", "353.04", "122.935", "488.587", "218.729", "2...
## $ 2010-08 <chr> "6.269", "343.688", "129.732", "473.731", "210.53", "2...
## $ 2010-09 <dbl> 4.699, 381.622, 132.892, 521.099, 236.368, 2.652, 7.34...
## $ 2010-10 <chr> "4.402", "384.987", "131.033", "532.403", "236.366", "...
## $ 2010-11 <chr> "3.731", "367.955", "130.889", "502.887", "221.881", "...
## $ 2010-12 <dbl> 3.156, 326.338, 121.422, 450.433, 196.211, 2.379, 7.19...
## $ 2011-01 <chr> "3.14", "334.958", "128.396", "468.418", "198.45", "2....
## $ 2011-02 <chr> "3.284", "346.234", "125.463", "504.068", "219.886", "...
## $ 2011-03 <dbl> 3.674, 380.399, 134.374, 516.730, 227.935, 2.559, 8.38...
## $ 2011-04 <chr> "4.251", "380.446", "134.169", "528.631", "242.28", "2...
## $ 2011-05 <chr> "4.431", "385.289", "136.14", "528.122", "225.776", "2...
## $ 2011-06 <dbl> 5.474, 376.317, 135.581, 529.528, 221.865, 2.815, 8.37...
## $ 2011-07 <chr> "6.581", "361.585", "132.41", "532.888", "231.01", "2....
## $ 2011-08 <chr> "6.733", "353.793", "130.616", "508.145", "220.164", "...
## $ 2011-09 <dbl> 5.003, 388.271, 136.901, 550.137, 244.949, 2.843, 8.31...
## $ 2011-10 <chr> "4.484", "398.456", "128.72", "554.932", "237.768", "2...
# Remove the first column of mbta2: mbta3
mbta3 <- mbta2[,-1]
glimpse(mbta3)
## Observations: 8
## Variables: 59
## $ mode    <chr> "Boat", "Bus", "Commuter Rail", "Heavy Rail", "Light R...
## $ 2007-01 <chr> "4", "335.819", "142.2", "435.294", "227.231", "4.772"...
## $ 2007-02 <chr> "3.6", "338.675", "138.5", "448.271", "240.262", "4.41...
## $ 2007-03 <dbl> 40.000, 339.867, 137.700, 458.583, 241.444, 4.574, 5.5...
## $ 2007-04 <chr> "4.3", "352.162", "139.5", "472.201", "255.557", "4.54...
## $ 2007-05 <chr> "4.9", "354.367", "139", "474.579", "248.262", "4.768"...
## $ 2007-06 <dbl> 5.800, 350.543, 143.000, 477.032, 246.108, 4.722, 5.60...
## $ 2007-07 <chr> "6.521", "357.519", "142.391", "471.735", "243.286", "...
## $ 2007-08 <chr> "6.572", "355.479", "142.364", "461.605", "234.907", "...
## $ 2007-09 <dbl> 5.469, 372.598, 143.051, 499.566, 265.748, 4.329, 5.60...
## $ 2007-10 <chr> "5.145", "368.847", "146.542", "457.741", "241.434", "...
## $ 2007-11 <chr> "3.763", "330.826", "145.089", "488.348", "250.497", "...
## $ 2007-12 <dbl> 2.985, 312.920, 141.585, 448.268, 233.379, 3.708, 5.06...
## $ 2008-01 <chr> "3.175", "340.324", "142.145", "472.624", "241.223", "...
## $ 2008-02 <chr> "3.111", "352.905", "142.607", "492.1", "249.306", "4....
## $ 2008-03 <dbl> 3.512, 361.155, 137.453, 494.046, 253.132, 4.175, 6.16...
## $ 2008-04 <chr> "4.164", "368.189", "140.389", "513.204", "271.07", "4...
## $ 2008-05 <chr> "4.015", "363.903", "142.585", "507.952", "258.351", "...
## $ 2008-06 <dbl> 5.189, 362.962, 142.057, 518.349, 266.961, 4.400, 6.29...
## $ 2008-07 <chr> "6.016", "370.921", "145.731", "512.309", "270.158", "...
## $ 2008-08 <chr> "5.8", "361.057", "144.565", "476.99", "239.344", "4.2...
## $ 2008-09 <dbl> 4.587, 389.537, 141.907, 517.324, 258.171, 4.878, 6.50...
## $ 2008-10 <chr> "4.285", "357.974", "151.957", "523.644", "250.063", "...
## $ 2008-11 <chr> "3.488", "345.423", "152.952", "487.115", "232.068", "...
## $ 2008-12 <dbl> 3.007, 325.767, 140.810, 446.743, 205.420, 3.420, 5.95...
## $ 2009-01 <chr> "3.014", "338.532", "141.448", "461.004", "215.66", "3...
## $ 2009-02 <chr> "3.196", "360.412", "143.529", "482.407", "228.737", "...
## $ 2009-03 <dbl> 3.330, 353.686, 142.893, 467.224, 222.844, 2.586, 6.67...
## $ 2009-04 <chr> "4.049", "359.38", "142.34", "493.152", "238.232", "2....
## $ 2009-05 <chr> "4.119", "354.75", "144.225", "475.634", "224.962", "2...
## $ 2009-06 <dbl> 4.900, 347.865, 142.006, 473.099, 226.259, 2.763, 6.79...
## $ 2009-07 <chr> "6.444", "339.477", "137.691", "470.828", "230.308", "...
## $ 2009-08 <chr> "5.903", "332.661", "139.158", "466.676", "231.783", "...
## $ 2009-09 <dbl> 4.696, 374.260, 139.087, 500.403, 250.922, 2.716, 6.69...
## $ 2009-10 <chr> "4.212", "385.868", "137.104", "513.406", "230.739", "...
## $ 2009-11 <chr> "3.576", "366.98", "129.343", "480.278", "214.711", "2...
## $ 2009-12 <dbl> 3.113, 332.394, 126.066, 440.925, 194.446, 2.365, 6.45...
## $ 2010-01 <chr> "3.207", "362.226", "130.91", "464.069", "204.396", "2...
## $ 2010-02 <chr> "3.195", "361.138", "131.918", "480.121", "213.136", "...
## $ 2010-03 <dbl> 3.481, 373.443, 131.252, 483.397, 211.693, 2.550, 7.06...
## $ 2010-04 <chr> "4.452", "378.611", "131.722", "502.374", "227.246", "...
## $ 2010-05 <chr> "4.415", "380.171", "128.8", "487.4", "217.805", "2.68...
## $ 2010-06 <dbl> 5.411, 363.275, 129.144, 490.263, 215.922, 2.646, 7.46...
## $ 2010-07 <chr> "6.513", "353.04", "122.935", "488.587", "218.729", "2...
## $ 2010-08 <chr> "6.269", "343.688", "129.732", "473.731", "210.53", "2...
## $ 2010-09 <dbl> 4.699, 381.622, 132.892, 521.099, 236.368, 2.652, 7.34...
## $ 2010-10 <chr> "4.402", "384.987", "131.033", "532.403", "236.366", "...
## $ 2010-11 <chr> "3.731", "367.955", "130.889", "502.887", "221.881", "...
## $ 2010-12 <dbl> 3.156, 326.338, 121.422, 450.433, 196.211, 2.379, 7.19...
## $ 2011-01 <chr> "3.14", "334.958", "128.396", "468.418", "198.45", "2....
## $ 2011-02 <chr> "3.284", "346.234", "125.463", "504.068", "219.886", "...
## $ 2011-03 <dbl> 3.674, 380.399, 134.374, 516.730, 227.935, 2.559, 8.38...
## $ 2011-04 <chr> "4.251", "380.446", "134.169", "528.631", "242.28", "2...
## $ 2011-05 <chr> "4.431", "385.289", "136.14", "528.122", "225.776", "2...
## $ 2011-06 <dbl> 5.474, 376.317, 135.581, 529.528, 221.865, 2.815, 8.37...
## $ 2011-07 <chr> "6.581", "361.585", "132.41", "532.888", "231.01", "2....
## $ 2011-08 <chr> "6.733", "353.793", "130.616", "508.145", "220.164", "...
## $ 2011-09 <dbl> 5.003, 388.271, 136.901, 550.137, 244.949, 2.843, 8.31...
## $ 2011-10 <chr> "4.484", "398.456", "128.72", "554.932", "237.768", "2...

Observations are stored in columns

  • This data is pretty much stored backwards
  • The observation of rider count is being made each month for differnt transit types
    • The month is the observation unique key and should be a column value
  • The mode of transportation could be a variable and be in one column or it could be spread and have one column per type
## mbta3 is pre-loaded
head(mbta3)
## # A tibble: 6 x 59
##            mode `2007-01` `2007-02` `2007-03` `2007-04` `2007-05`
##           <chr>     <chr>     <chr>     <dbl>     <chr>     <chr>
## 1          Boat         4       3.6    40.000       4.3       4.9
## 2           Bus   335.819   338.675   339.867   352.162   354.367
## 3 Commuter Rail     142.2     138.5   137.700     139.5       139
## 4    Heavy Rail   435.294   448.271   458.583   472.201   474.579
## 5    Light Rail   227.231   240.262   241.444   255.557   248.262
## 6   Private Bus     4.772     4.417     4.574     4.542     4.768
## # ... with 53 more variables: `2007-06` <dbl>, `2007-07` <chr>,
## #   `2007-08` <chr>, `2007-09` <dbl>, `2007-10` <chr>, `2007-11` <chr>,
## #   `2007-12` <dbl>, `2008-01` <chr>, `2008-02` <chr>, `2008-03` <dbl>,
## #   `2008-04` <chr>, `2008-05` <chr>, `2008-06` <dbl>, `2008-07` <chr>,
## #   `2008-08` <chr>, `2008-09` <dbl>, `2008-10` <chr>, `2008-11` <chr>,
## #   `2008-12` <dbl>, `2009-01` <chr>, `2009-02` <chr>, `2009-03` <dbl>,
## #   `2009-04` <chr>, `2009-05` <chr>, `2009-06` <dbl>, `2009-07` <chr>,
## #   `2009-08` <chr>, `2009-09` <dbl>, `2009-10` <chr>, `2009-11` <chr>,
## #   `2009-12` <dbl>, `2010-01` <chr>, `2010-02` <chr>, `2010-03` <dbl>,
## #   `2010-04` <chr>, `2010-05` <chr>, `2010-06` <dbl>, `2010-07` <chr>,
## #   `2010-08` <chr>, `2010-09` <dbl>, `2010-10` <chr>, `2010-11` <chr>,
## #   `2010-12` <dbl>, `2011-01` <chr>, `2011-02` <chr>, `2011-03` <dbl>,
## #   `2011-04` <chr>, `2011-05` <chr>, `2011-06` <dbl>, `2011-07` <chr>,
## #   `2011-08` <chr>, `2011-09` <dbl>, `2011-10` <chr>
# Load tidyr
library(tidyr)

# Gather columns of mbta3: mbta4
mbta4 <- gather(mbta3, month, thou_riders, -mode)

# View the head of mbta4
head(mbta4)
## # A tibble: 6 x 3
##            mode   month thou_riders
##           <chr>   <chr>       <chr>
## 1          Boat 2007-01           4
## 2           Bus 2007-01     335.819
## 3 Commuter Rail 2007-01       142.2
## 4    Heavy Rail 2007-01     435.294
## 5    Light Rail 2007-01     227.231
## 6   Private Bus 2007-01       4.772

Type conversions

## mbta4 is pre-loaded
head(mbta4)
## # A tibble: 6 x 3
##            mode   month thou_riders
##           <chr>   <chr>       <chr>
## 1          Boat 2007-01           4
## 2           Bus 2007-01     335.819
## 3 Commuter Rail 2007-01       142.2
## 4    Heavy Rail 2007-01     435.294
## 5    Light Rail 2007-01     227.231
## 6   Private Bus 2007-01       4.772
# Coerce thou_riders to numeric
mbta4$thou_riders <- as.numeric(mbta4$thou_riders)

Variables are stored in both rows and columns

  • Here they want to spread all the modes of transportaion into separate columns
  • I actually like it in a column for these reasons…
    • I think of the variables as month, trasportation mode, and rider count.
    • If doing a group by and summarize it needs to be in a column to group by type
    • Also ggplot will require it in a column
  • But The exercise is to spread it out. Though they make a chart and have to re-melt it later! : )
    • But its not a big deal. Its easy to gather or spread this info based on whats needed for analysis.
## tidyr is pre-loaded

# Spread the contents of mbta4: mbta5
head(mbta4)
## # A tibble: 6 x 3
##            mode   month thou_riders
##           <chr>   <chr>       <dbl>
## 1          Boat 2007-01       4.000
## 2           Bus 2007-01     335.819
## 3 Commuter Rail 2007-01     142.200
## 4    Heavy Rail 2007-01     435.294
## 5    Light Rail 2007-01     227.231
## 6   Private Bus 2007-01       4.772
mbta5 <- spread(mbta4, mode, thou_riders)

# View the head of mbta5
head(mbta5)
## # A tibble: 6 x 9
##     month  Boat     Bus `Commuter Rail` `Heavy Rail` `Light Rail`
##     <chr> <dbl>   <dbl>           <dbl>        <dbl>        <dbl>
## 1 2007-01   4.0 335.819           142.2      435.294      227.231
## 2 2007-02   3.6 338.675           138.5      448.271      240.262
## 3 2007-03  40.0 339.867           137.7      458.583      241.444
## 4 2007-04   4.3 352.162           139.5      472.201      255.557
## 5 2007-05   4.9 354.367           139.0      474.579      248.262
## 6 2007-06   5.8 350.543           143.0      477.032      246.108
## # ... with 3 more variables: `Private Bus` <dbl>, RIDE <dbl>, `Trackless
## #   Trolley` <dbl>

Separating columns

# View the head of mbta5
head(mbta5)
## # A tibble: 6 x 9
##     month  Boat     Bus `Commuter Rail` `Heavy Rail` `Light Rail`
##     <chr> <dbl>   <dbl>           <dbl>        <dbl>        <dbl>
## 1 2007-01   4.0 335.819           142.2      435.294      227.231
## 2 2007-02   3.6 338.675           138.5      448.271      240.262
## 3 2007-03  40.0 339.867           137.7      458.583      241.444
## 4 2007-04   4.3 352.162           139.5      472.201      255.557
## 5 2007-05   4.9 354.367           139.0      474.579      248.262
## 6 2007-06   5.8 350.543           143.0      477.032      246.108
## # ... with 3 more variables: `Private Bus` <dbl>, RIDE <dbl>, `Trackless
## #   Trolley` <dbl>
# Split month column into month and year: mbta6
mbta6 <- separate(mbta5, month, c('year', 'month'))

# View the head of mbta6
head(mbta6)
## # A tibble: 6 x 10
##    year month  Boat     Bus `Commuter Rail` `Heavy Rail` `Light Rail`
##   <chr> <chr> <dbl>   <dbl>           <dbl>        <dbl>        <dbl>
## 1  2007    01   4.0 335.819           142.2      435.294      227.231
## 2  2007    02   3.6 338.675           138.5      448.271      240.262
## 3  2007    03  40.0 339.867           137.7      458.583      241.444
## 4  2007    04   4.3 352.162           139.5      472.201      255.557
## 5  2007    05   4.9 354.367           139.0      474.579      248.262
## 6  2007    06   5.8 350.543           143.0      477.032      246.108
## # ... with 3 more variables: `Private Bus` <dbl>, RIDE <dbl>, `Trackless
## #   Trolley` <dbl>

Do your values seem reasonable?

  • See, I would have just filtered the mode to Boat then made the hist
# View a summary of mbta6
summary(mbta6)
##      year              month                Boat             Bus       
##  Length:58          Length:58          Min.   : 2.985   Min.   :312.9  
##  Class :character   Class :character   1st Qu.: 3.494   1st Qu.:345.6  
##  Mode  :character   Mode  :character   Median : 4.293   Median :359.9  
##                                        Mean   : 5.068   Mean   :358.6  
##                                        3rd Qu.: 5.356   3rd Qu.:372.2  
##                                        Max.   :40.000   Max.   :398.5  
##  Commuter Rail     Heavy Rail      Light Rail     Private Bus   
##  Min.   :121.4   Min.   :435.3   Min.   :194.4   Min.   :2.213  
##  1st Qu.:131.4   1st Qu.:471.1   1st Qu.:220.6   1st Qu.:2.641  
##  Median :138.8   Median :487.3   Median :231.9   Median :2.820  
##  Mean   :137.4   Mean   :489.3   Mean   :233.0   Mean   :3.352  
##  3rd Qu.:142.4   3rd Qu.:511.3   3rd Qu.:244.5   3rd Qu.:4.167  
##  Max.   :153.0   Max.   :554.9   Max.   :271.1   Max.   :4.878  
##       RIDE       Trackless Trolley
##  Min.   :4.900   Min.   : 5.777   
##  1st Qu.:5.965   1st Qu.:11.679   
##  Median :6.615   Median :12.598   
##  Mean   :6.604   Mean   :12.125   
##  3rd Qu.:7.149   3rd Qu.:13.320   
##  Max.   :8.598   Max.   :15.109
# Generate a histogram of Boat ridership
hist(mbta6$Boat)

Dealing with entry error

# Find the row number of the incorrect value: i
i <- which(mbta6$Boat == 40)

# Replace the incorrect value with 4
mbta6$Boat[i] <- 4

# Generate a histogram of Boat column
hist(mbta6$Boat)

Extra plots

  • As a teaser for a later class I guess, they threw in some ggplot charts at the very end of this exercise
    • They use a couple dataframes that they have made themselves.
    • I will need to get dplyr and ggplot and make the same data frames they use here to copy these charts
    • And ironically I will need to gather the data back by mode, and reunite the year month. : )
library(dplyr)
library(ggplot2)

## need to use the old mbta data frame because ggplot likes the variables to be melted
head(mbta6)
## # A tibble: 6 x 10
##    year month  Boat     Bus `Commuter Rail` `Heavy Rail` `Light Rail`
##   <chr> <chr> <dbl>   <dbl>           <dbl>        <dbl>        <dbl>
## 1  2007    01   4.0 335.819           142.2      435.294      227.231
## 2  2007    02   3.6 338.675           138.5      448.271      240.262
## 3  2007    03   4.0 339.867           137.7      458.583      241.444
## 4  2007    04   4.3 352.162           139.5      472.201      255.557
## 5  2007    05   4.9 354.367           139.0      474.579      248.262
## 6  2007    06   5.8 350.543           143.0      477.032      246.108
## # ... with 3 more variables: `Private Bus` <dbl>, RIDE <dbl>, `Trackless
## #   Trolley` <dbl>
table(mbta4$mode)
## 
##              Boat               Bus     Commuter Rail        Heavy Rail 
##                58                58                58                58 
##        Light Rail       Private Bus              RIDE Trackless Trolley 
##                58                58                58                58
mbta_all <- mbta6 %>%
  unite(year_mon, year, month, sep = "") %>%
  gather(mode, thou_riders, -year_mon)
  
mbta_boat <- mbta_all %>%
  filter(mode %in% c("Boat","Trackless Trolley"))

head(mbta_boat)
## # A tibble: 6 x 3
##   year_mon  mode thou_riders
##      <chr> <chr>       <dbl>
## 1   200701  Boat         4.0
## 2   200702  Boat         3.6
## 3   200703  Boat         4.0
## 4   200704  Boat         4.3
## 5   200705  Boat         4.9
## 6   200706  Boat         5.8
table(mbta_boat$mode)
## 
##              Boat Trackless Trolley 
##                58                58
# Look at Boat and Trackless Trolley ridership over time (don't change)
## The old outlier point for boat is still in here
ggplot(mbta_boat, aes(x = year_mon, y = thou_riders, col = mode)) +  geom_point() + 
  scale_x_discrete(name = "Month", breaks = c(200701, 200801, 200901, 201001, 201101)) + 
  scale_y_continuous(name = "Avg Weekday Ridership (thousands)")

# Look at all T ridership over time (don't change)
ggplot(mbta_all, aes(x = year_mon, y = thou_riders, col = mode)) + geom_point() + 
  scale_x_discrete(name = "Month", breaks = c(200701, 200801, 200901, 201001, 201101)) +  
  scale_y_continuous(name = "Avg Weekday Ridership (thousands)")

   


World Food Facts


Importing the data

# Load data.table
library(data.table)

# Import food.csv: food
url_food <- 'http://s3.amazonaws.com/assets.datacamp.com/production/course_1294/datasets/food.csv'
food <- fread(url_food)

# Convert food to a data frame
food <- data.frame(food)

Examining the data

# Loaded dplyr

# View a glimpse of food
glimpse(food)
## Observations: 1,500
## Variables: 160
## $ V1                                         <chr> "1", "2", "3", "4",...
## $ code                                       <dbl> 1.592115e-311, 2.67...
## $ url                                        <chr> "http://world-en.op...
## $ creator                                    <chr> "sebleouf", "foodor...
## $ created_t                                  <chr> "1424747544", "1450...
## $ created_datetime                           <chr> "2015-02-24T03:12:2...
## $ last_modified_t                            <chr> "1438445887", "1450...
## $ last_modified_datetime                     <chr> "2015-08-01T16:18:0...
## $ product_name                               <chr> "Confiture de frais...
## $ generic_name                               <chr> "", "", "Pâtes de f...
## $ quantity                                   <chr> "265 g", "375g", "1...
## $ packaging                                  <chr> "Bocal,Verre", "Pla...
## $ packaging_tags                             <chr> "bocal,verre", "pla...
## $ brands                                     <chr> "Casino Délices", "...
## $ brands_tags                                <chr> "casino-delices", "...
## $ categories                                 <chr> "Aliments et boisso...
## $ categories_tags                            <chr> "en:plant-based-foo...
## $ categories_en                              <chr> "Plant-based foods ...
## $ origins                                    <chr> "", "", "", "", "Ar...
## $ origins_tags                               <chr> "", "", "", "", "ar...
## $ manufacturing_places                       <chr> "France", "Belgium"...
## $ manufacturing_places_tags                  <chr> "france", "belgium"...
## $ labels                                     <chr> "", "", "", "Vegeta...
## $ labels_tags                                <chr> "", "", "", "en:veg...
## $ labels_en                                  <chr> "", "", "", "Vegeta...
## $ emb_codes                                  <chr> "EMB 78015", "", ""...
## $ emb_codes_tags                             <chr> "emb-78015", "", ""...
## $ first_packaging_code_geo                   <chr> "48.983333,2.066667...
## $ cities                                     <chr> "", "", "", "", "",...
## $ cities_tags                                <chr> "andresy-yvelines-f...
## $ purchase_places                            <chr> "Lyon,France", "NSW...
## $ stores                                     <chr> "Casino", "", "", "...
## $ countries                                  <chr> "France", "Australi...
## $ countries_tags                             <chr> "en:france", "en:au...
## $ countries_en                               <chr> "France", "Australi...
## $ ingredients_text                           <chr> "Sucre de canne, fr...
## $ allergens                                  <chr> "", "", "", "", "",...
## $ allergens_en                               <chr> "", "", "", "", "",...
## $ traces                                     <chr> "Lait,Fruits à coqu...
## $ traces_tags                                <chr> "en:milk,en:nuts", ...
## $ traces_en                                  <chr> "Milk,Nuts", "", ""...
## $ serving_size                               <chr> "15 g", "", "", "",...
## $ no_nutriments                              <lgl> NA, NA, NA, NA, NA,...
## $ additives_n                                <int> 1, NA, 2, 5, 0, NA,...
## $ additives                                  <chr> "[ sucre-de-canne -...
## $ additives_tags                             <chr> "en:e440", "", "en:...
## $ additives_en                               <chr> "E440 - Pectins", "...
## $ ingredients_from_palm_oil_n                <int> 0, NA, 0, 0, 0, NA,...
## $ ingredients_from_palm_oil                  <lgl> NA, NA, NA, NA, NA,...
## $ ingredients_from_palm_oil_tags             <chr> "", "", "", "", "",...
## $ ingredients_that_may_be_from_palm_oil_n    <int> 0, NA, 0, 1, 0, NA,...
## $ ingredients_that_may_be_from_palm_oil      <lgl> NA, NA, NA, NA, NA,...
## $ ingredients_that_may_be_from_palm_oil_tags <chr> "", "", "", "e471-m...
## $ nutrition_grade_uk                         <lgl> NA, NA, NA, NA, NA,...
## $ nutrition_grade_fr                         <chr> "d", "", "", "d", "...
## $ pnns_groups_1                              <chr> "Sugary snacks", "S...
## $ pnns_groups_2                              <chr> "Sweets", "Chocolat...
## $ states                                     <chr> "en:to-be-checked, ...
## $ states_tags                                <chr> "en:to-be-checked,e...
## $ states_en                                  <chr> "To be checked,Comp...
## $ main_category                              <chr> "en:plant-based-foo...
## $ main_category_en                           <chr> "Plant-based foods ...
## $ image_url                                  <chr> "http://en.openfood...
## $ image_small_url                            <chr> "http://en.openfood...
## $ energy_100g                                <dbl> 918, NA, NA, 766, 2...
## $ energy_from_fat_100g                       <dbl> NA, NA, NA, NA, NA,...
## $ fat_100g                                   <dbl> 0.00, NA, NA, 16.70...
## $ saturated_fat_100g                         <dbl> 0.000, NA, NA, 9.90...
## $ butyric_acid_100g                          <lgl> NA, NA, NA, NA, NA,...
## $ caproic_acid_100g                          <lgl> NA, NA, NA, NA, NA,...
## $ caprylic_acid_100g                         <lgl> NA, NA, NA, NA, NA,...
## $ capric_acid_100g                           <lgl> NA, NA, NA, NA, NA,...
## $ lauric_acid_100g                           <lgl> NA, NA, NA, NA, NA,...
## $ myristic_acid_100g                         <lgl> NA, NA, NA, NA, NA,...
## $ palmitic_acid_100g                         <lgl> NA, NA, NA, NA, NA,...
## $ stearic_acid_100g                          <lgl> NA, NA, NA, NA, NA,...
## $ arachidic_acid_100g                        <lgl> NA, NA, NA, NA, NA,...
## $ behenic_acid_100g                          <lgl> NA, NA, NA, NA, NA,...
## $ lignoceric_acid_100g                       <lgl> NA, NA, NA, NA, NA,...
## $ cerotic_acid_100g                          <lgl> NA, NA, NA, NA, NA,...
## $ montanic_acid_100g                         <lgl> NA, NA, NA, NA, NA,...
## $ melissic_acid_100g                         <lgl> NA, NA, NA, NA, NA,...
## $ monounsaturated_fat_100g                   <dbl> NA, NA, NA, 2.9, 9....
## $ polyunsaturated_fat_100g                   <dbl> NA, NA, NA, 3.9, 32...
## $ omega_3_fat_100g                           <dbl> NA, NA, NA, NA, NA,...
## $ alpha_linolenic_acid_100g                  <dbl> NA, NA, NA, NA, NA,...
## $ eicosapentaenoic_acid_100g                 <dbl> NA, NA, NA, NA, NA,...
## $ docosahexaenoic_acid_100g                  <dbl> NA, NA, NA, NA, NA,...
## $ omega_6_fat_100g                           <dbl> NA, NA, NA, NA, NA,...
## $ linoleic_acid_100g                         <dbl> NA, NA, NA, NA, NA,...
## $ arachidonic_acid_100g                      <lgl> NA, NA, NA, NA, NA,...
## $ gamma_linolenic_acid_100g                  <lgl> NA, NA, NA, NA, NA,...
## $ dihomo_gamma_linolenic_acid_100g           <lgl> NA, NA, NA, NA, NA,...
## $ omega_9_fat_100g                           <lgl> NA, NA, NA, NA, NA,...
## $ oleic_acid_100g                            <lgl> NA, NA, NA, NA, NA,...
## $ elaidic_acid_100g                          <lgl> NA, NA, NA, NA, NA,...
## $ gondoic_acid_100g                          <lgl> NA, NA, NA, NA, NA,...
## $ mead_acid_100g                             <lgl> NA, NA, NA, NA, NA,...
## $ erucic_acid_100g                           <lgl> NA, NA, NA, NA, NA,...
## $ nervonic_acid_100g                         <lgl> NA, NA, NA, NA, NA,...
## $ trans_fat_100g                             <dbl> NA, NA, NA, NA, NA,...
## $ cholesterol_100g                           <dbl> NA, NA, NA, 0.00020...
## $ carbohydrates_100g                         <dbl> 54.00, NA, NA, 5.70...
## $ sugars_100g                                <dbl> 54.00, NA, NA, 4.20...
## $ sucrose_100g                               <lgl> NA, NA, NA, NA, NA,...
## $ glucose_100g                               <lgl> NA, NA, NA, NA, NA,...
## $ fructose_100g                              <int> NA, NA, NA, NA, NA,...
## $ lactose_100g                               <dbl> NA, NA, NA, NA, NA,...
## $ maltose_100g                               <lgl> NA, NA, NA, NA, NA,...
## $ maltodextrins_100g                         <lgl> NA, NA, NA, NA, NA,...
## $ starch_100g                                <dbl> NA, NA, NA, NA, NA,...
## $ polyols_100g                               <dbl> NA, NA, NA, NA, NA,...
## $ fiber_100g                                 <dbl> NA, NA, NA, 0.2, 9....
## $ proteins_100g                              <dbl> 0.00, NA, NA, 2.90,...
## $ casein_100g                                <dbl> NA, NA, NA, NA, NA,...
## $ serum_proteins_100g                        <lgl> NA, NA, NA, NA, NA,...
## $ nucleotides_100g                           <lgl> NA, NA, NA, NA, NA,...
## $ salt_100g                                  <dbl> 0.0000000, NA, NA, ...
## $ sodium_100g                                <dbl> 0.0000000, NA, NA, ...
## $ alcohol_100g                               <dbl> NA, NA, NA, NA, NA,...
## $ vitamin_a_100g                             <dbl> NA, NA, NA, NA, NA,...
## $ beta_carotene_100g                         <lgl> NA, NA, NA, NA, NA,...
## $ vitamin_d_100g                             <dbl> NA, NA, NA, NA, NA,...
## $ vitamin_e_100g                             <dbl> NA, NA, NA, NA, NA,...
## $ vitamin_k_100g                             <dbl> NA, NA, NA, NA, NA,...
## $ vitamin_c_100g                             <dbl> NA, NA, NA, NA, NA,...
## $ vitamin_b1_100g                            <dbl> NA, NA, NA, NA, NA,...
## $ vitamin_b2_100g                            <dbl> NA, NA, NA, NA, NA,...
## $ vitamin_pp_100g                            <dbl> NA, NA, NA, NA, NA,...
## $ vitamin_b6_100g                            <dbl> NA, NA, NA, NA, NA,...
## $ vitamin_b9_100g                            <dbl> NA, NA, NA, NA, NA,...
## $ vitamin_b12_100g                           <dbl> NA, NA, NA, NA, NA,...
## $ biotin_100g                                <dbl> NA, NA, NA, NA, NA,...
## $ pantothenic_acid_100g                      <dbl> NA, NA, NA, NA, NA,...
## $ silica_100g                                <dbl> NA, NA, NA, NA, NA,...
## $ bicarbonate_100g                           <dbl> NA, NA, NA, NA, NA,...
## $ potassium_100g                             <dbl> NA, NA, NA, NA, NA,...
## $ chloride_100g                              <dbl> NA, NA, NA, NA, NA,...
## $ calcium_100g                               <dbl> NA, NA, NA, NA, NA,...
## $ phosphorus_100g                            <dbl> NA, NA, NA, NA, 1.1...
## $ iron_100g                                  <dbl> NA, NA, NA, NA, 0.0...
## $ magnesium_100g                             <dbl> NA, NA, NA, NA, 0.1...
## $ zinc_100g                                  <dbl> NA, NA, NA, NA, NA,...
## $ copper_100g                                <dbl> NA, NA, NA, NA, NA,...
## $ manganese_100g                             <dbl> NA, NA, NA, NA, NA,...
## $ fluoride_100g                              <dbl> NA, NA, NA, NA, NA,...
## $ selenium_100g                              <dbl> NA, NA, NA, NA, NA,...
## $ chromium_100g                              <lgl> NA, NA, NA, NA, NA,...
## $ molybdenum_100g                            <lgl> NA, NA, NA, NA, NA,...
## $ iodine_100g                                <dbl> NA, NA, NA, NA, NA,...
## $ caffeine_100g                              <lgl> NA, NA, NA, NA, NA,...
## $ taurine_100g                               <lgl> NA, NA, NA, NA, NA,...
## $ ph_100g                                    <lgl> NA, NA, NA, NA, NA,...
## $ fruits_vegetables_nuts_100g                <dbl> 54, NA, NA, NA, NA,...
## $ collagen_meat_protein_ratio_100g           <int> NA, NA, NA, NA, NA,...
## $ cocoa_100g                                 <int> NA, NA, NA, NA, NA,...
## $ chlorophyl_100g                            <lgl> NA, NA, NA, NA, NA,...
## $ carbon_footprint_100g                      <dbl> NA, NA, NA, NA, NA,...
## $ nutrition_score_fr_100g                    <int> 11, NA, NA, 11, 17,...
## $ nutrition_score_uk_100g                    <int> 11, NA, NA, 11, 17,...
# View column names of food
names(food)
##   [1] "V1"                                        
##   [2] "code"                                      
##   [3] "url"                                       
##   [4] "creator"                                   
##   [5] "created_t"                                 
##   [6] "created_datetime"                          
##   [7] "last_modified_t"                           
##   [8] "last_modified_datetime"                    
##   [9] "product_name"                              
##  [10] "generic_name"                              
##  [11] "quantity"                                  
##  [12] "packaging"                                 
##  [13] "packaging_tags"                            
##  [14] "brands"                                    
##  [15] "brands_tags"                               
##  [16] "categories"                                
##  [17] "categories_tags"                           
##  [18] "categories_en"                             
##  [19] "origins"                                   
##  [20] "origins_tags"                              
##  [21] "manufacturing_places"                      
##  [22] "manufacturing_places_tags"                 
##  [23] "labels"                                    
##  [24] "labels_tags"                               
##  [25] "labels_en"                                 
##  [26] "emb_codes"                                 
##  [27] "emb_codes_tags"                            
##  [28] "first_packaging_code_geo"                  
##  [29] "cities"                                    
##  [30] "cities_tags"                               
##  [31] "purchase_places"                           
##  [32] "stores"                                    
##  [33] "countries"                                 
##  [34] "countries_tags"                            
##  [35] "countries_en"                              
##  [36] "ingredients_text"                          
##  [37] "allergens"                                 
##  [38] "allergens_en"                              
##  [39] "traces"                                    
##  [40] "traces_tags"                               
##  [41] "traces_en"                                 
##  [42] "serving_size"                              
##  [43] "no_nutriments"                             
##  [44] "additives_n"                               
##  [45] "additives"                                 
##  [46] "additives_tags"                            
##  [47] "additives_en"                              
##  [48] "ingredients_from_palm_oil_n"               
##  [49] "ingredients_from_palm_oil"                 
##  [50] "ingredients_from_palm_oil_tags"            
##  [51] "ingredients_that_may_be_from_palm_oil_n"   
##  [52] "ingredients_that_may_be_from_palm_oil"     
##  [53] "ingredients_that_may_be_from_palm_oil_tags"
##  [54] "nutrition_grade_uk"                        
##  [55] "nutrition_grade_fr"                        
##  [56] "pnns_groups_1"                             
##  [57] "pnns_groups_2"                             
##  [58] "states"                                    
##  [59] "states_tags"                               
##  [60] "states_en"                                 
##  [61] "main_category"                             
##  [62] "main_category_en"                          
##  [63] "image_url"                                 
##  [64] "image_small_url"                           
##  [65] "energy_100g"                               
##  [66] "energy_from_fat_100g"                      
##  [67] "fat_100g"                                  
##  [68] "saturated_fat_100g"                        
##  [69] "butyric_acid_100g"                         
##  [70] "caproic_acid_100g"                         
##  [71] "caprylic_acid_100g"                        
##  [72] "capric_acid_100g"                          
##  [73] "lauric_acid_100g"                          
##  [74] "myristic_acid_100g"                        
##  [75] "palmitic_acid_100g"                        
##  [76] "stearic_acid_100g"                         
##  [77] "arachidic_acid_100g"                       
##  [78] "behenic_acid_100g"                         
##  [79] "lignoceric_acid_100g"                      
##  [80] "cerotic_acid_100g"                         
##  [81] "montanic_acid_100g"                        
##  [82] "melissic_acid_100g"                        
##  [83] "monounsaturated_fat_100g"                  
##  [84] "polyunsaturated_fat_100g"                  
##  [85] "omega_3_fat_100g"                          
##  [86] "alpha_linolenic_acid_100g"                 
##  [87] "eicosapentaenoic_acid_100g"                
##  [88] "docosahexaenoic_acid_100g"                 
##  [89] "omega_6_fat_100g"                          
##  [90] "linoleic_acid_100g"                        
##  [91] "arachidonic_acid_100g"                     
##  [92] "gamma_linolenic_acid_100g"                 
##  [93] "dihomo_gamma_linolenic_acid_100g"          
##  [94] "omega_9_fat_100g"                          
##  [95] "oleic_acid_100g"                           
##  [96] "elaidic_acid_100g"                         
##  [97] "gondoic_acid_100g"                         
##  [98] "mead_acid_100g"                            
##  [99] "erucic_acid_100g"                          
## [100] "nervonic_acid_100g"                        
## [101] "trans_fat_100g"                            
## [102] "cholesterol_100g"                          
## [103] "carbohydrates_100g"                        
## [104] "sugars_100g"                               
## [105] "sucrose_100g"                              
## [106] "glucose_100g"                              
## [107] "fructose_100g"                             
## [108] "lactose_100g"                              
## [109] "maltose_100g"                              
## [110] "maltodextrins_100g"                        
## [111] "starch_100g"                               
## [112] "polyols_100g"                              
## [113] "fiber_100g"                                
## [114] "proteins_100g"                             
## [115] "casein_100g"                               
## [116] "serum_proteins_100g"                       
## [117] "nucleotides_100g"                          
## [118] "salt_100g"                                 
## [119] "sodium_100g"                               
## [120] "alcohol_100g"                              
## [121] "vitamin_a_100g"                            
## [122] "beta_carotene_100g"                        
## [123] "vitamin_d_100g"                            
## [124] "vitamin_e_100g"                            
## [125] "vitamin_k_100g"                            
## [126] "vitamin_c_100g"                            
## [127] "vitamin_b1_100g"                           
## [128] "vitamin_b2_100g"                           
## [129] "vitamin_pp_100g"                           
## [130] "vitamin_b6_100g"                           
## [131] "vitamin_b9_100g"                           
## [132] "vitamin_b12_100g"                          
## [133] "biotin_100g"                               
## [134] "pantothenic_acid_100g"                     
## [135] "silica_100g"                               
## [136] "bicarbonate_100g"                          
## [137] "potassium_100g"                            
## [138] "chloride_100g"                             
## [139] "calcium_100g"                              
## [140] "phosphorus_100g"                           
## [141] "iron_100g"                                 
## [142] "magnesium_100g"                            
## [143] "zinc_100g"                                 
## [144] "copper_100g"                               
## [145] "manganese_100g"                            
## [146] "fluoride_100g"                             
## [147] "selenium_100g"                             
## [148] "chromium_100g"                             
## [149] "molybdenum_100g"                           
## [150] "iodine_100g"                               
## [151] "caffeine_100g"                             
## [152] "taurine_100g"                              
## [153] "ph_100g"                                   
## [154] "fruits_vegetables_nuts_100g"               
## [155] "collagen_meat_protein_ratio_100g"          
## [156] "cocoa_100g"                                
## [157] "chlorophyl_100g"                           
## [158] "carbon_footprint_100g"                     
## [159] "nutrition_score_fr_100g"                   
## [160] "nutrition_score_uk_100g"
  • sheesh, this is a lot of variables

Inspecting variables

  • I really don’t want to print this all out in the doc so I am limiting the variables here
# View summary of food
summary(food[,1:10])
##       V1                 code       url              creator         
##  Length:1500        Min.   :0   Length:1500        Length:1500       
##  Class :character   1st Qu.:0   Class :character   Class :character  
##  Mode  :character   Median :0   Mode  :character   Mode  :character  
##                     Mean   :0                                        
##                     3rd Qu.:0                                        
##                     Max.   :0                                        
##   created_t         created_datetime   last_modified_t   
##  Length:1500        Length:1500        Length:1500       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##  last_modified_datetime product_name       generic_name      
##  Length:1500            Length:1500        Length:1500       
##  Class :character       Class :character   Class :character  
##  Mode  :character       Mode  :character   Mode  :character  
##                                                              
##                                                              
## 
# View head of food
head(food[,1:10])
##   V1          code
## 1  1 1.592115e-311
## 2  2 2.673378e-311
## 3  3 1.613001e-311
## 4  4 3.955509e-311
## 5  5 4.189677e-311
## 6  6 4.333113e-313
##                                                                                                                            url
## 1 http://world-en.openfoodfacts.org/product/3222475745867/confiture-de-fraise-fraise-des-bois-au-sucre-de-canne-casino-delices
## 2                                         http://world-en.openfoodfacts.org/product/5410976880110/guylian-sea-shells-selection
## 3                                  http://world-en.openfoodfacts.org/product/3264750423503/pates-de-fruits-aromatisees-jacquot
## 4                                  http://world-en.openfoodfacts.org/product/8006040247001/nata-vegetal-a-base-de-soja-valsoia
## 5           http://world-en.openfoodfacts.org/product/8480000340764/semillas-de-girasol-con-cascara-tostadas-aguasal-hacendado
## 6                                                           http://world-en.openfoodfacts.org/product/0087703177727/soft-drink
##       creator  created_t     created_datetime last_modified_t
## 1    sebleouf 1424747544 2015-02-24T03:12:24Z      1438445887
## 2 foodorigins 1450316429 2015-12-17T01:40:29Z      1450817956
## 3    domdom26 1428674916 2015-04-10T14:08:36Z      1428739289
## 4     javichu 1420416591 2015-01-05T00:09:51Z      1420417876
## 5     javichu 1420501121 2015-01-05T23:38:41Z      1445700917
## 6 foodorigins 1437983923 2015-07-27T07:58:43Z      1445577476
##   last_modified_datetime
## 1   2015-08-01T16:18:07Z
## 2   2015-12-22T20:59:16Z
## 3   2015-04-11T08:01:29Z
## 4   2015-01-05T00:31:16Z
## 5   2015-10-24T15:35:17Z
## 6   2015-10-23T05:17:56Z
##                                            product_name
## 1 Confiture de fraise fraise des bois au sucre de canne
## 2                          Guylian Sea Shells Selection
## 3                           Pâtes de fruits aromatisées
## 4       Nata vegetal a base de soja &quot;Valsoia&quot;
## 5      Semillas de girasol con cáscara tostadas aguasal
## 6                                            Soft Drink
##                                       generic_name
## 1                                                 
## 2                                                 
## 3                                  Pâtes de fruits
## 4                      Nata vegetal a base de soja
## 5 Semillas de girasol con cáscara tostadas aguasal
## 6
# View structure of food
str(food[,1:10])
## 'data.frame':    1500 obs. of  10 variables:
##  $ V1                    : chr  "1" "2" "3" "4" ...
##  $ code                  : num  1.59e-311 2.67e-311 1.61e-311 3.96e-311 4.19e-311 ...
##  $ url                   : chr  "http://world-en.openfoodfacts.org/product/3222475745867/confiture-de-fraise-fraise-des-bois-au-sucre-de-canne-casino-delices" "http://world-en.openfoodfacts.org/product/5410976880110/guylian-sea-shells-selection" "http://world-en.openfoodfacts.org/product/3264750423503/pates-de-fruits-aromatisees-jacquot" "http://world-en.openfoodfacts.org/product/8006040247001/nata-vegetal-a-base-de-soja-valsoia" ...
##  $ creator               : chr  "sebleouf" "foodorigins" "domdom26" "javichu" ...
##  $ created_t             : chr  "1424747544" "1450316429" "1428674916" "1420416591" ...
##  $ created_datetime      : chr  "2015-02-24T03:12:24Z" "2015-12-17T01:40:29Z" "2015-04-10T14:08:36Z" "2015-01-05T00:09:51Z" ...
##  $ last_modified_t       : chr  "1438445887" "1450817956" "1428739289" "1420417876" ...
##  $ last_modified_datetime: chr  "2015-08-01T16:18:07Z" "2015-12-22T20:59:16Z" "2015-04-11T08:01:29Z" "2015-01-05T00:31:16Z" ...
##  $ product_name          : chr  "Confiture de fraise fraise des bois au sucre de canne" "Guylian Sea Shells Selection" "Pâtes de fruits aromatisées" "Nata vegetal a base de soja &quot;Valsoia&quot;" ...
##  $ generic_name          : chr  "" "" "Pâtes de fruits" "Nata vegetal a base de soja" ...

Removing dupicate info

  • These may be off a bit, this gets rid of the packaging_tags column which is needed later to do the counts of plasti
# Define vector of duplicate cols (don't change)
duplicates <- c(4, 6, 11, 13, 15, 17, 18, 20, 22, 
                24, 25, 28, 32, 34, 36, 38, 40, 
                44, 46, 48, 51, 54, 65, 158)

# Remove duplicates from food: food2
food2 <- food[,-duplicates]

Removing useless info

# Define useless vector (don't change)
useless <- c(1, 2, 3, 32:41)

# Remove useless columns from food2: food3
food3 <- food2[,-useless]

Finding columns

# Create vector of column indices: nutrition
nutrition <- str_detect(names(food3),"100g")

# View a summary of nutrition columns
summary(food3[,nutrition])
##  energy_from_fat_100g    fat_100g      saturated_fat_100g
##  Min.   :   0.00      Min.   :  0.00   Min.   : 0.000    
##  1st Qu.:  35.98      1st Qu.:  0.90   1st Qu.: 0.200    
##  Median : 237.00      Median :  6.00   Median : 1.700    
##  Mean   : 668.41      Mean   : 13.39   Mean   : 4.874    
##  3rd Qu.: 974.00      3rd Qu.: 20.00   3rd Qu.: 6.500    
##  Max.   :2900.00      Max.   :100.00   Max.   :57.000    
##  NA's   :1486         NA's   :708      NA's   :797       
##  butyric_acid_100g caproic_acid_100g caprylic_acid_100g capric_acid_100g
##  Mode:logical      Mode:logical      Mode:logical       Mode:logical    
##  NA's:1500         NA's:1500         NA's:1500          NA's:1500       
##                                                                         
##                                                                         
##                                                                         
##                                                                         
##                                                                         
##  lauric_acid_100g myristic_acid_100g palmitic_acid_100g stearic_acid_100g
##  Mode:logical     Mode:logical       Mode:logical       Mode:logical     
##  NA's:1500        NA's:1500          NA's:1500          NA's:1500        
##                                                                          
##                                                                          
##                                                                          
##                                                                          
##                                                                          
##  arachidic_acid_100g behenic_acid_100g lignoceric_acid_100g
##  Mode:logical        Mode:logical      Mode:logical        
##  NA's:1500           NA's:1500         NA's:1500           
##                                                            
##                                                            
##                                                            
##                                                            
##                                                            
##  cerotic_acid_100g montanic_acid_100g melissic_acid_100g
##  Mode:logical      Mode:logical       Mode:logical      
##  NA's:1500         NA's:1500          NA's:1500         
##                                                         
##                                                         
##                                                         
##                                                         
##                                                         
##  monounsaturated_fat_100g polyunsaturated_fat_100g omega_3_fat_100g
##  Min.   : 0.00            Min.   : 0.400           Min.   : 0.033  
##  1st Qu.: 3.87            1st Qu.: 1.653           1st Qu.: 1.300  
##  Median : 9.50            Median : 3.900           Median : 3.000  
##  Mean   :19.77            Mean   : 9.986           Mean   : 3.726  
##  3rd Qu.:29.00            3rd Qu.:12.700           3rd Qu.: 3.200  
##  Max.   :75.00            Max.   :46.200           Max.   :12.400  
##  NA's   :1465             NA's   :1464             NA's   :1491    
##  alpha_linolenic_acid_100g eicosapentaenoic_acid_100g
##  Min.   :0.0800            Min.   :0.721             
##  1st Qu.:0.0905            1st Qu.:0.721             
##  Median :0.1010            Median :0.721             
##  Mean   :0.1737            Mean   :0.721             
##  3rd Qu.:0.2205            3rd Qu.:0.721             
##  Max.   :0.3400            Max.   :0.721             
##  NA's   :1497              NA's   :1499              
##  docosahexaenoic_acid_100g omega_6_fat_100g linoleic_acid_100g
##  Min.   :1.09              Min.   :0.25     Min.   :0.5000    
##  1st Qu.:1.09              1st Qu.:0.25     1st Qu.:0.5165    
##  Median :1.09              Median :0.25     Median :0.5330    
##  Mean   :1.09              Mean   :0.25     Mean   :0.5330    
##  3rd Qu.:1.09              3rd Qu.:0.25     3rd Qu.:0.5495    
##  Max.   :1.09              Max.   :0.25     Max.   :0.5660    
##  NA's   :1499              NA's   :1499     NA's   :1498      
##  arachidonic_acid_100g gamma_linolenic_acid_100g
##  Mode:logical          Mode:logical             
##  NA's:1500             NA's:1500                
##                                                 
##                                                 
##                                                 
##                                                 
##                                                 
##  dihomo_gamma_linolenic_acid_100g omega_9_fat_100g oleic_acid_100g
##  Mode:logical                     Mode:logical     Mode:logical   
##  NA's:1500                        NA's:1500        NA's:1500      
##                                                                   
##                                                                   
##                                                                   
##                                                                   
##                                                                   
##  elaidic_acid_100g gondoic_acid_100g mead_acid_100g erucic_acid_100g
##  Mode:logical      Mode:logical      Mode:logical   Mode:logical    
##  NA's:1500         NA's:1500         NA's:1500      NA's:1500       
##                                                                     
##                                                                     
##                                                                     
##                                                                     
##                                                                     
##  nervonic_acid_100g trans_fat_100g   cholesterol_100g carbohydrates_100g
##  Mode:logical       Min.   :0.0000   Min.   :0.0000   Min.   :  0.000   
##  NA's:1500          1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:  3.792   
##                     Median :0.0000   Median :0.0000   Median : 13.500   
##                     Mean   :0.0105   Mean   :0.0265   Mean   : 27.958   
##                     3rd Qu.:0.0000   3rd Qu.:0.0026   3rd Qu.: 55.000   
##                     Max.   :0.1000   Max.   :0.4300   Max.   :100.000   
##                     NA's   :1481     NA's   :1477     NA's   :708       
##   sugars_100g     sucrose_100g   glucose_100g   fructose_100g 
##  Min.   :  0.00   Mode:logical   Mode:logical   Min.   :100   
##  1st Qu.:  1.00   NA's:1500      NA's:1500      1st Qu.:100   
##  Median :  4.05                                 Median :100   
##  Mean   : 12.66                                 Mean   :100   
##  3rd Qu.: 14.70                                 3rd Qu.:100   
##  Max.   :100.00                                 Max.   :100   
##  NA's   :788                                    NA's   :1499  
##   lactose_100g   maltose_100g   maltodextrins_100g  starch_100g   
##  Min.   :0.000   Mode:logical   Mode:logical       Min.   : 0.00  
##  1st Qu.:0.250   NA's:1500      NA's:1500          1st Qu.: 9.45  
##  Median :0.500                                     Median :39.50  
##  Mean   :2.933                                     Mean   :30.73  
##  3rd Qu.:4.400                                     3rd Qu.:42.85  
##  Max.   :8.300                                     Max.   :71.00  
##  NA's   :1497                                      NA's   :1493   
##   polyols_100g     fiber_100g     proteins_100g     casein_100g  
##  Min.   : 8.60   Min.   : 0.000   Min.   : 0.000   Min.   :1.1   
##  1st Qu.:59.10   1st Qu.: 0.500   1st Qu.: 1.500   1st Qu.:1.1   
##  Median :67.00   Median : 1.750   Median : 6.000   Median :1.1   
##  Mean   :56.06   Mean   : 2.823   Mean   : 7.563   Mean   :1.1   
##  3rd Qu.:69.80   3rd Qu.: 3.500   3rd Qu.:10.675   3rd Qu.:1.1   
##  Max.   :70.00   Max.   :46.700   Max.   :61.000   Max.   :1.1   
##  NA's   :1491    NA's   :994      NA's   :710      NA's   :1499  
##  serum_proteins_100g nucleotides_100g   salt_100g         sodium_100g     
##  Mode:logical        Mode:logical     Min.   :  0.0000   Min.   : 0.0000  
##  NA's:1500           NA's:1500        1st Qu.:  0.0438   1st Qu.: 0.0172  
##                                       Median :  0.4498   Median : 0.1771  
##                                       Mean   :  1.1205   Mean   : 0.4409  
##                                       3rd Qu.:  1.1938   3rd Qu.: 0.4700  
##                                       Max.   :102.0000   Max.   :40.0000  
##                                       NA's   :780        NA's   :780      
##   alcohol_100g   vitamin_a_100g   beta_carotene_100g vitamin_d_100g 
##  Min.   : 0.00   Min.   :0.0000   Mode:logical       Min.   :0e+00  
##  1st Qu.: 0.00   1st Qu.:0.0000   NA's:1500          1st Qu.:0e+00  
##  Median : 5.50   Median :0.0001                      Median :0e+00  
##  Mean   :10.07   Mean   :0.0003                      Mean   :0e+00  
##  3rd Qu.:13.00   3rd Qu.:0.0006                      3rd Qu.:0e+00  
##  Max.   :50.00   Max.   :0.0013                      Max.   :1e-04  
##  NA's   :1433    NA's   :1477                        NA's   :1485   
##  vitamin_e_100g   vitamin_k_100g vitamin_c_100g  vitamin_b1_100g 
##  Min.   :0.0005   Min.   :0      Min.   :0.000   Min.   :0.0001  
##  1st Qu.:0.0021   1st Qu.:0      1st Qu.:0.002   1st Qu.:0.0003  
##  Median :0.0044   Median :0      Median :0.019   Median :0.0004  
##  Mean   :0.0069   Mean   :0      Mean   :0.025   Mean   :0.0006  
##  3rd Qu.:0.0097   3rd Qu.:0      3rd Qu.:0.030   3rd Qu.:0.0010  
##  Max.   :0.0320   Max.   :0      Max.   :0.217   Max.   :0.0013  
##  NA's   :1478     NA's   :1498   NA's   :1459    NA's   :1478    
##  vitamin_b2_100g  vitamin_pp_100g  vitamin_b6_100g  vitamin_b9_100g
##  Min.   :0.0002   Min.   :0.0006   Min.   :0.0001   Min.   :0e+00  
##  1st Qu.:0.0003   1st Qu.:0.0033   1st Qu.:0.0002   1st Qu.:0e+00  
##  Median :0.0009   Median :0.0069   Median :0.0008   Median :1e-04  
##  Mean   :0.0011   Mean   :0.0086   Mean   :0.0112   Mean   :1e-04  
##  3rd Qu.:0.0013   3rd Qu.:0.0140   3rd Qu.:0.0012   3rd Qu.:2e-04  
##  Max.   :0.0066   Max.   :0.0160   Max.   :0.2000   Max.   :2e-04  
##  NA's   :1483     NA's   :1484     NA's   :1481     NA's   :1483   
##  vitamin_b12_100g  biotin_100g   pantothenic_acid_100g  silica_100g   
##  Min.   :0        Min.   :0      Min.   :0.0000        Min.   :8e-04  
##  1st Qu.:0        1st Qu.:0      1st Qu.:0.0007        1st Qu.:8e-04  
##  Median :0        Median :0      Median :0.0020        Median :8e-04  
##  Mean   :0        Mean   :0      Mean   :0.0027        Mean   :8e-04  
##  3rd Qu.:0        3rd Qu.:0      3rd Qu.:0.0051        3rd Qu.:8e-04  
##  Max.   :0        Max.   :0      Max.   :0.0060        Max.   :8e-04  
##  NA's   :1489     NA's   :1498   NA's   :1486          NA's   :1499   
##  bicarbonate_100g potassium_100g   chloride_100g     calcium_100g   
##  Min.   :0.0006   Min.   :0.0000   Min.   :0.0003   Min.   :0.0000  
##  1st Qu.:0.0678   1st Qu.:0.0650   1st Qu.:0.0006   1st Qu.:0.0450  
##  Median :0.1350   Median :0.1940   Median :0.0009   Median :0.1200  
##  Mean   :0.1692   Mean   :0.3288   Mean   :0.0144   Mean   :0.2040  
##  3rd Qu.:0.2535   3rd Qu.:0.3670   3rd Qu.:0.0214   3rd Qu.:0.1985  
##  Max.   :0.3720   Max.   :1.4300   Max.   :0.0420   Max.   :1.0000  
##  NA's   :1497     NA's   :1487     NA's   :1497     NA's   :1449    
##  phosphorus_100g    iron_100g      magnesium_100g     zinc_100g     
##  Min.   :0.0430   Min.   :0.0000   Min.   :0.0000   Min.   :0.0005  
##  1st Qu.:0.1938   1st Qu.:0.0012   1st Qu.:0.0670   1st Qu.:0.0009  
##  Median :0.3185   Median :0.0042   Median :0.1040   Median :0.0017  
##  Mean   :0.3777   Mean   :0.0045   Mean   :0.1066   Mean   :0.0016  
##  3rd Qu.:0.4340   3rd Qu.:0.0077   3rd Qu.:0.1300   3rd Qu.:0.0022  
##  Max.   :1.1550   Max.   :0.0137   Max.   :0.3330   Max.   :0.0026  
##  NA's   :1488     NA's   :1463     NA's   :1479     NA's   :1493    
##   copper_100g    manganese_100g fluoride_100g  selenium_100g 
##  Min.   :0e+00   Min.   :0      Min.   :0      Min.   :0     
##  1st Qu.:1e-04   1st Qu.:0      1st Qu.:0      1st Qu.:0     
##  Median :1e-04   Median :0      Median :0      Median :0     
##  Mean   :1e-04   Mean   :0      Mean   :0      Mean   :0     
##  3rd Qu.:1e-04   3rd Qu.:0      3rd Qu.:0      3rd Qu.:0     
##  Max.   :1e-04   Max.   :0      Max.   :0      Max.   :0     
##  NA's   :1498    NA's   :1499   NA's   :1498   NA's   :1499  
##  chromium_100g  molybdenum_100g  iodine_100g   caffeine_100g 
##  Mode:logical   Mode:logical    Min.   :0      Mode:logical  
##  NA's:1500      NA's:1500       1st Qu.:0      NA's:1500     
##                                 Median :0                    
##                                 Mean   :0                    
##                                 3rd Qu.:0                    
##                                 Max.   :0                    
##                                 NA's   :1499                 
##  taurine_100g   ph_100g        fruits_vegetables_nuts_100g
##  Mode:logical   Mode:logical   Min.   : 2.00              
##  NA's:1500      NA's:1500      1st Qu.:11.25              
##                                Median :42.00              
##                                Mean   :36.88              
##                                3rd Qu.:52.25              
##                                Max.   :80.00              
##                                NA's   :1470               
##  collagen_meat_protein_ratio_100g   cocoa_100g   chlorophyl_100g
##  Min.   :12.00                    Min.   :30     Mode:logical   
##  1st Qu.:13.50                    1st Qu.:47     NA's:1500      
##  Median :15.00                    Median :60                    
##  Mean   :15.67                    Mean   :57                    
##  3rd Qu.:17.50                    3rd Qu.:70                    
##  Max.   :20.00                    Max.   :81                    
##  NA's   :1497                     NA's   :1491                  
##  nutrition_score_fr_100g nutrition_score_uk_100g
##  Min.   :-12.000         Min.   :-12.000        
##  1st Qu.:  1.000         1st Qu.:  0.000        
##  Median :  7.000         Median :  6.000        
##  Mean   :  7.941         Mean   :  7.631        
##  3rd Qu.: 15.000         3rd Qu.: 16.000        
##  Max.   : 28.000         Max.   : 28.000        
##  NA's   :825             NA's   :825

Replacing missing values

# Find indices of sugar NA values: missing
missing <- is.na(food3$sugars_100g)

# Replace NA values with 0
food3$sugars_100g[missing] <- 0

# Create first histogram
hist(food3$sugars_100g, breaks=100)

# Create food4
food4 <- food3[food3$sugars_100g != 0, ]

# Create second histogram
hist(food4$sugars_100g, breaks=100)

Dealing with messy data

  • This exercise is attempting to estimate the number of food items in plastic packaging
  • going back to the original food dataframe here
    • packagin_tags column was removed in an earlier exercise. that maybe a mistake
# Find entries containing "plasti": plastic
plastic <- str_detect(food$packaging_tags,"plasti")

# Print the sum of plastic
sum(plastic)
## [1] 540

   


School Attendance Data


Importing the data

# Load the gdata package
library(gdata)

# Import the spreadsheet: att
url_att <- 'http://s3.amazonaws.com/assets.datacamp.com/production/course_1294/datasets/attendance.xls'
att <- read.xls(url_att)

Examining the data

# Print the column names 
names(att)
##  [1] "Table.43..Average.daily.attendance..ADA..as.a.percentage.of.total.enrollment..school.day.length..and.school.year.length.in.public.schools..by.school.level.and.state..2007.08"
##  [2] "X"                                                                                                                                                                            
##  [3] "X.1"                                                                                                                                                                          
##  [4] "X.2"                                                                                                                                                                          
##  [5] "X.3"                                                                                                                                                                          
##  [6] "X.4"                                                                                                                                                                          
##  [7] "X.5"                                                                                                                                                                          
##  [8] "X.6"                                                                                                                                                                          
##  [9] "X.7"                                                                                                                                                                          
## [10] "X.8"                                                                                                                                                                          
## [11] "X.9"                                                                                                                                                                          
## [12] "X.10"                                                                                                                                                                         
## [13] "X.11"                                                                                                                                                                         
## [14] "X.12"                                                                                                                                                                         
## [15] "X.13"                                                                                                                                                                         
## [16] "X.14"                                                                                                                                                                         
## [17] "X.15"
# Print the first 6 rows
head(att)
##   Table.43..Average.daily.attendance..ADA..as.a.percentage.of.total.enrollment..school.day.length..and.school.year.length.in.public.schools..by.school.level.and.state..2007.08
## 1                                                                                                                                                                              
## 2                                                                                                                                                                              
## 3                                                                                                                                                                             1
## 4                                                                                                                                                        United States ........
## 5                                                                                                                                                     Alabama .................
## 6                                                                                                                                                     Alaska ..................
##                                                                        X
## 1 Total elementary, secondary, and combined elementary/secondary schools
## 2                                           ADA as percent of enrollment
## 3                                                                      2
## 4                                                                   93.1
## 5                                                                   93.8
## 6                                                                   89.9
##      X.1                         X.2    X.3                         X.4
## 1                                                                      
## 2        Average hours in school day        Average days in school year
## 3                                  3                                  4
## 4 (0.22)                         6.6 (0.02)                         180
## 5 (1.24)                         7.0 (0.07)                         180
## 6 (1.22)                         6.5 (0.05)                         180
##     X.5                          X.6    X.7                          X.8
## 1                                                     Elementary schools
## 2       Average hours in school year        ADA as percent of enrollment
## 3                                  5                                   6
## 4 (0.1)                        1,193  (3.1)                         94.0
## 5 (0.8)                        1,267 (12.3)                         93.8
## 6 (3.4)                        1,163 (22.9)                         91.3
##      X.9                        X.10   X.11                         X.12
## 1                                                      Secondary schools
## 2        Average hours in school day        ADA as percent of enrollment
## 3                                  7                                   8
## 4 (0.27)                         6.7 (0.02)                         91.1
## 5 (1.84)                         7.0 (0.08)                         94.6
## 6 (1.56)                         6.5 (0.05)                         93.2
##     X.13                        X.14   X.15
## 1                                          
## 2        Average hours in school day       
## 3                                  9       
## 4 (0.43)                         6.6 (0.04)
## 5 (0.38)                         7.1 (0.17)
## 6 (1.57)                         6.2 (0.15)
# Print the last 6 rows
tail(att)
##                                                                            Table.43..Average.daily.attendance..ADA..as.a.percentage.of.total.enrollment..school.day.length..and.school.year.length.in.public.schools..by.school.level.and.state..2007.08
## 54                                                                                                                                                                                                                             Wisconsin ...............
## 55                                                                                                                                                                                                                             Wyoming .................
## 56                                                                                                                                                                                                                                      †Not applicable.
## 57                                                                                                                                                                                                       ‡Reporting standards not met (too few cases).  
## 58 NOTE: Averages reflect data reported by schools rather than state requirements. School-reported length of day may exceed state requirements, and there is a range of statistical error in reported estimates. Standard errors appear in parentheses. 
## 59                                                                                                     SOURCE: U.S. Department of Education, National Center for Education Statistics, Schools and Staffing Survey (SASS), \\Public School Questionnaire
##                                                               X    X.1 X.2
## 54                                                         95.0 (0.57) 6.9
## 55                                                         92.4 (1.15) 6.9
## 56                                                                        
## 57                                                                        
## 58                                                                        
## 59 \\ 2003-04 and 2007-08. (This table was prepared June 2011.)           
##       X.3 X.4   X.5   X.6   X.7  X.8    X.9 X.10   X.11 X.12   X.13 X.14
## 54 (0.04) 180 (0.7) 1,246 (8.6) 95.4 (0.41)  6.9 (0.05) 93.0 (1.91)  7.0
## 55 (0.05) 175 (1.3) 1,201 (8.3) 92.2 (1.65)  6.9 (0.05) 92.4 (0.75)  7.0
## 56                                                                      
## 57                                                                      
## 58                                                                      
## 59                                                                      
##      X.15
## 54 (0.14)
## 55 (0.07)
## 56       
## 57       
## 58       
## 59
# Print the structure
str(att)
## 'data.frame':    59 obs. of  17 variables:
##  $ Table.43..Average.daily.attendance..ADA..as.a.percentage.of.total.enrollment..school.day.length..and.school.year.length.in.public.schools..by.school.level.and.state..2007.08: Factor w/ 58 levels "","   United States ........",..: 1 1 5 2 6 7 8 9 10 11 ...
##  $ X                                                                                                                                                                            : Factor w/ 42 levels "","\\ 2003-04 and 2007-08. (This table was prepared June 2011.)",..: 42 41 3 22 28 8 6 14 23 29 ...
##  $ X.1                                                                                                                                                                          : Factor w/ 45 levels "","(0.22)","(0.23)",..: 1 1 1 2 22 21 41 27 14 6 ...
##  $ X.2                                                                                                                                                                          : Factor w/ 14 levels "","3","6.2","6.3",..: 1 14 2 7 11 6 5 10 3 11 ...
##  $ X.3                                                                                                                                                                          : Factor w/ 14 levels "","(0.02)","(0.03)",..: 1 1 1 2 7 5 9 6 7 5 ...
##  $ X.4                                                                                                                                                                          : Factor w/ 15 levels "","171","172",..: 1 15 14 10 10 10 11 9 11 2 ...
##  $ X.5                                                                                                                                                                          : Factor w/ 22 levels "","(0.0)","(0.1)",..: 1 1 1 3 10 21 19 4 6 12 ...
##  $ X.6                                                                                                                                                                          : Factor w/ 48 levels "","1,102","1,117",..: 1 48 47 26 45 15 13 36 5 28 ...
##  $ X.7                                                                                                                                                                          : Factor w/ 48 levels "","(10.1)","(10.3)",..: 1 1 1 36 11 35 22 6 13 48 ...
##  $ X.8                                                                                                                                                                          : Factor w/ 40 levels "","6","81.0",..: 40 39 2 22 20 9 5 13 27 25 ...
##  $ X.9                                                                                                                                                                          : Factor w/ 51 levels "","(0.24)","(0.25)",..: 1 1 1 4 32 25 48 35 20 13 ...
##  $ X.10                                                                                                                                                                         : Factor w/ 14 levels "","6.2","6.3",..: 1 14 10 7 11 5 4 9 3 11 ...
##  $ X.11                                                                                                                                                                         : Factor w/ 13 levels "","(0.02)","(0.03)",..: 1 1 1 2 8 5 10 8 5 7 ...
##  $ X.12                                                                                                                                                                         : Factor w/ 41 levels "","‡","8","85.8",..: 41 40 3 19 35 30 8 18 9 20 ...
##  $ X.13                                                                                                                                                                         : Factor w/ 47 levels "","(†)","(0.35)",..: 1 1 1 7 5 25 43 35 23 20 ...
##  $ X.14                                                                                                                                                                         : Factor w/ 17 levels "","‡","5.9","6.1",..: 1 17 16 9 14 5 7 11 4 13 ...
##  $ X.15                                                                                                                                                                         : Factor w/ 29 levels "","(†)","(0.03)",..: 1 1 1 4 15 13 21 8 18 9 ...

Removing unnecessary rows

# Create remove
remove <- c(3,56:59)

# Create att2
att2 <- att[-remove,]

Removing useless columns

# Create remove
remove <- seq(3,17,2)

# Create att3
att3 <- att2[,-remove]

Splitting the data

head(att3)
##   Table.43..Average.daily.attendance..ADA..as.a.percentage.of.total.enrollment..school.day.length..and.school.year.length.in.public.schools..by.school.level.and.state..2007.08
## 1                                                                                                                                                                              
## 2                                                                                                                                                                              
## 4                                                                                                                                                        United States ........
## 5                                                                                                                                                     Alabama .................
## 6                                                                                                                                                     Alaska ..................
## 7                                                                                                                                                     Arizona .................
##                                                                        X
## 1 Total elementary, secondary, and combined elementary/secondary schools
## 2                                           ADA as percent of enrollment
## 4                                                                   93.1
## 5                                                                   93.8
## 6                                                                   89.9
## 7                                                                   89.0
##                           X.2                         X.4
## 1                                                        
## 2 Average hours in school day Average days in school year
## 4                         6.6                         180
## 5                         7.0                         180
## 6                         6.5                         180
## 7                         6.4                         181
##                            X.6                          X.8
## 1                                        Elementary schools
## 2 Average hours in school year ADA as percent of enrollment
## 4                        1,193                         94.0
## 5                        1,267                         93.8
## 6                        1,163                         91.3
## 7                        1,159                         88.9
##                          X.10                         X.12
## 1                                        Secondary schools
## 2 Average hours in school day ADA as percent of enrollment
## 4                         6.7                         91.1
## 5                         7.0                         94.6
## 6                         6.5                         93.2
## 7                         6.4                         89.0
##                          X.14
## 1                            
## 2 Average hours in school day
## 4                         6.6
## 5                         7.1
## 6                         6.2
## 7                         6.4
# Subset just elementary schools: att_elem
att_elem <- att3[,c(1,6,7)]
head(att_elem)
##   Table.43..Average.daily.attendance..ADA..as.a.percentage.of.total.enrollment..school.day.length..and.school.year.length.in.public.schools..by.school.level.and.state..2007.08
## 1                                                                                                                                                                              
## 2                                                                                                                                                                              
## 4                                                                                                                                                        United States ........
## 5                                                                                                                                                     Alabama .................
## 6                                                                                                                                                     Alaska ..................
## 7                                                                                                                                                     Arizona .................
##                            X.8                        X.10
## 1           Elementary schools                            
## 2 ADA as percent of enrollment Average hours in school day
## 4                         94.0                         6.7
## 5                         93.8                         7.0
## 6                         91.3                         6.5
## 7                         88.9                         6.4
# Subset just secondary schools: att_sec
att_sec <- att3[,c(1,8,9)]
head(att_sec)
##   Table.43..Average.daily.attendance..ADA..as.a.percentage.of.total.enrollment..school.day.length..and.school.year.length.in.public.schools..by.school.level.and.state..2007.08
## 1                                                                                                                                                                              
## 2                                                                                                                                                                              
## 4                                                                                                                                                        United States ........
## 5                                                                                                                                                     Alabama .................
## 6                                                                                                                                                     Alaska ..................
## 7                                                                                                                                                     Arizona .................
##                           X.12                        X.14
## 1            Secondary schools                            
## 2 ADA as percent of enrollment Average hours in school day
## 4                         91.1                         6.6
## 5                         94.6                         7.1
## 6                         93.2                         6.2
## 7                         89.0                         6.4
# Subset all schools: att4
att4 <- att3[,1:5]
head(att4)
##   Table.43..Average.daily.attendance..ADA..as.a.percentage.of.total.enrollment..school.day.length..and.school.year.length.in.public.schools..by.school.level.and.state..2007.08
## 1                                                                                                                                                                              
## 2                                                                                                                                                                              
## 4                                                                                                                                                        United States ........
## 5                                                                                                                                                     Alabama .................
## 6                                                                                                                                                     Alaska ..................
## 7                                                                                                                                                     Arizona .................
##                                                                        X
## 1 Total elementary, secondary, and combined elementary/secondary schools
## 2                                           ADA as percent of enrollment
## 4                                                                   93.1
## 5                                                                   93.8
## 6                                                                   89.9
## 7                                                                   89.0
##                           X.2                         X.4
## 1                                                        
## 2 Average hours in school day Average days in school year
## 4                         6.6                         180
## 5                         7.0                         180
## 6                         6.5                         180
## 7                         6.4                         181
##                            X.6
## 1                             
## 2 Average hours in school year
## 4                        1,193
## 5                        1,267
## 6                        1,163
## 7                        1,159

Replacing the names

# Define cnames vector (don't change)
cnames <- c("state", "avg_attend_pct", "avg_hr_per_day", 
            "avg_day_per_yr", "avg_hr_per_yr")

# Assign column names of att4
colnames(att4) <- cnames
head(att4)
##                       state
## 1                          
## 2                          
## 4    United States ........
## 5 Alabama .................
## 6 Alaska ..................
## 7 Arizona .................
##                                                           avg_attend_pct
## 1 Total elementary, secondary, and combined elementary/secondary schools
## 2                                           ADA as percent of enrollment
## 4                                                                   93.1
## 5                                                                   93.8
## 6                                                                   89.9
## 7                                                                   89.0
##                avg_hr_per_day              avg_day_per_yr
## 1                                                        
## 2 Average hours in school day Average days in school year
## 4                         6.6                         180
## 5                         7.0                         180
## 6                         6.5                         180
## 7                         6.4                         181
##                  avg_hr_per_yr
## 1                             
## 2 Average hours in school year
## 4                        1,193
## 5                        1,267
## 6                        1,163
## 7                        1,159
# Remove first two rows of att4: att5
att5 <- att4[-c(1,2),]

# View the names of att5
names(att5)
## [1] "state"          "avg_attend_pct" "avg_hr_per_day" "avg_day_per_yr"
## [5] "avg_hr_per_yr"
head(att5)
##                       state avg_attend_pct avg_hr_per_day avg_day_per_yr
## 4    United States ........           93.1            6.6            180
## 5 Alabama .................           93.8            7.0            180
## 6 Alaska ..................           89.9            6.5            180
## 7 Arizona .................           89.0            6.4            181
## 8 Arkansas ................           91.8            6.9            179
## 9 California ..............           93.2            6.2            181
##   avg_hr_per_yr
## 4         1,193
## 5         1,267
## 6         1,163
## 7         1,159
## 8         1,229
## 9         1,129

Cleaning up extra characters

## stringr and att5 are pre-loaded
head(att5)
##                       state avg_attend_pct avg_hr_per_day avg_day_per_yr
## 4    United States ........           93.1            6.6            180
## 5 Alabama .................           93.8            7.0            180
## 6 Alaska ..................           89.9            6.5            180
## 7 Arizona .................           89.0            6.4            181
## 8 Arkansas ................           91.8            6.9            179
## 9 California ..............           93.2            6.2            181
##   avg_hr_per_yr
## 4         1,193
## 5         1,267
## 6         1,163
## 7         1,159
## 8         1,229
## 9         1,129
# Remove all periods in state column
att5$state <- str_replace_all(att5$state, "\\.","")
head(att5)
##               state avg_attend_pct avg_hr_per_day avg_day_per_yr
## 4    United States            93.1            6.6            180
## 5          Alabama            93.8            7.0            180
## 6           Alaska            89.9            6.5            180
## 7          Arizona            89.0            6.4            181
## 8         Arkansas            91.8            6.9            179
## 9       California            93.2            6.2            181
##   avg_hr_per_yr
## 4         1,193
## 5         1,267
## 6         1,163
## 7         1,159
## 8         1,229
## 9         1,129
# Remove white space around state names
att5$state <- str_trim(att5$state)

# View the head of att5
head(att5)
##           state avg_attend_pct avg_hr_per_day avg_day_per_yr avg_hr_per_yr
## 4 United States           93.1            6.6            180         1,193
## 5       Alabama           93.8            7.0            180         1,267
## 6        Alaska           89.9            6.5            180         1,163
## 7       Arizona           89.0            6.4            181         1,159
## 8      Arkansas           91.8            6.9            179         1,229
## 9    California           93.2            6.2            181         1,129

Some final type conversions

  • dplyr class comes later so this part is just an example here
  • This is how you should do it but the not dplyr (old) way is to use sapply
# Change columns to numeric using dplyr (don't change)
library(dplyr)
example <- mutate_each(att5, funs(as.numeric), -state)
str(example)
## 'data.frame':    52 obs. of  5 variables:
##  $ state         : chr  "United States" "Alabama" "Alaska" "Arizona" ...
##  $ avg_attend_pct: num  22 28 8 6 14 23 29 5 7 11 ...
##  $ avg_hr_per_day: num  7 11 6 5 10 3 11 6 8 10 ...
##  $ avg_day_per_yr: num  10 10 10 11 9 11 2 11 11 11 ...
##  $ avg_hr_per_yr : num  26 45 15 13 36 5 28 19 31 42 ...
# Define vector containing numerical columns: cols
cols <- c(2:5)

# Use sapply to coerce cols to numeric
att5[, cols] <- sapply(att5[,cols], as.numeric)
str(att5)
## 'data.frame':    52 obs. of  5 variables:
##  $ state         : chr  "United States" "Alabama" "Alaska" "Arizona" ...
##  $ avg_attend_pct: num  22 28 8 6 14 23 29 5 7 11 ...
##  $ avg_hr_per_day: num  7 11 6 5 10 3 11 6 8 10 ...
##  $ avg_day_per_yr: num  10 10 10 11 9 11 2 11 11 11 ...
##  $ avg_hr_per_yr : num  26 45 15 13 36 5 28 19 31 42 ...