Course notes for Importing and Cleaning Data in R: Case Studies
# Import sales.csv: sales
url_sales <- 'http://s3.amazonaws.com/assets.datacamp.com/production/course_1294/datasets/sales.csv'
sales <- read.csv(url_sales, stringsAsFactors=F)
# View dimensions of sales
dim(sales)
## [1] 5000 46
# Inspect first 6 rows of sales
head(sales)
## X event_id primary_act_id secondary_act_id
## 1 1 abcaf1adb99a935fc661 43f0436b905bfa7c2eec b85143bf51323b72e53c
## 2 2 6c56d7f08c95f2aa453c 1a3e9aecd0617706a794 f53529c5679ea6ca5a48
## 3 3 c7ab4524a121f9d687d2 4b677c3f5bec71eec8d1 b85143bf51323b72e53c
## 4 4 394cb493f893be9b9ed1 b1ccea01ad6ef8522796 b85143bf51323b72e53c
## 5 5 55b5f67e618557929f48 91c03a34b562436efa3c b85143bf51323b72e53c
## 6 6 4f10fd8b9f550352bd56 ac4b847b3fde66f2117e 63814f3d63317f1b56c4
## purch_party_lkup_id
## 1 7dfa56dd7d5956b17587
## 2 4f9e6fc637eaf7b736c2
## 3 6c2545703bd527a7144d
## 4 527d6b1eaffc69ddd882
## 5 8bd62c394a35213bdf52
## 6 3b3a628f83135acd0676
## event_name
## 1 Xfinity Center Mansfield Premier Parking: Florida Georgia Line
## 2 Gorge Camping - dave matthews band - sept 3-7
## 3 Dodge Theatre Adams Street Parking - benise
## 4 Gexa Energy Pavilion Vip Parking : kid rock with sheryl crow
## 5 Premier Parking - motley crue
## 6 Fast Lane Access: Journey
## primary_act_name secondary_act_name
## 1 XFINITY Center Mansfield Premier Parking NULL
## 2 Gorge Camping Dave Matthews Band
## 3 Parking Event NULL
## 4 Gexa Energy Pavilion VIP Parking NULL
## 5 White River Amphitheatre Premier Parking NULL
## 6 Fast Lane Access Journey
## major_cat_name minor_cat_name la_event_type_cat
## 1 MISC PARKING PARKING
## 2 MISC CAMPING INVALID
## 3 MISC PARKING PARKING
## 4 MISC PARKING PARKING
## 5 MISC PARKING PARKING
## 6 MISC SPECIAL ENTRY (UPSELL) UPSELL
## event_disp_name
## 1 Xfinity Center Mansfield Premier Parking: Florida Georgia Line
## 2 Gorge Camping - dave matthews band - sept 3-7
## 3 Dodge Theatre Adams Street Parking - benise
## 4 Gexa Energy Pavilion Vip Parking : kid rock with sheryl crow
## 5 Premier Parking - motley crue
## 6 Fast Lane Access: Journey
## ticket_text
## 1 THIS TICKET IS VALID FOR PARKING ONLY GOOD THIS DAY ONLY PREMIER PARKING PASS XFINITY CENTER,LOTS 4 PM SAT SEP 12 2015 7:30 PM
## 2 %OVERNIGHT C A M P I N G%* * * * * *%GORGE CAMPGROUND%* GOOD THIS DATE ONLY *%SEP 3 - 6, 2009
## 3 ADAMS STREET GARAGE%PARKING FOR 4/21/06 ONLY%DODGE THEATRE PARKING PASS%ENTRANCE ON ADAMS STREET%BENISE%GARAGE OPENS AT 6:00PM
## 4 THIS TICKET IS VALID FOR PARKING ONLY GOOD FOR THIS DATE ONLY VIP PARKING PASS GEXA ENERGY PAVILION FRI SEP 02 2011 7:00 PM
## 5 THIS TICKET IS VALID%FOR PARKING ONLY%GOOD THIS DATE ONLY%PREMIER PARKING PASS%WHITE RIVER AMPHITHEATRE%SAT JUL 30, 2005 6:00PM
## 6 FAST LANE JOURNEY FAST LANE EVENT THIS IS NOT A TICKET SAN MANUEL AMPHITHEATER SAT JUL 21 2012 7:00 PM
## tickets_purchased_qty trans_face_val_amt delivery_type_cd
## 1 1 45 eTicket
## 2 1 75 TicketFast
## 3 1 5 TicketFast
## 4 1 20 Mail
## 5 1 20 Mail
## 6 2 10 TicketFast
## event_date_time event_dt presale_dt onsale_dt
## 1 2015-09-12 23:30:00 2015-09-12 NULL 2015-05-15
## 2 2009-09-05 01:00:00 2009-09-04 NULL 2009-03-13
## 3 2006-04-22 01:30:00 2006-04-21 NULL 2006-02-25
## 4 2011-09-03 00:00:00 2011-09-02 NULL 2011-04-22
## 5 2005-07-31 01:00:00 2005-07-30 2005-03-02 2005-03-04
## 6 2012-07-22 02:00:00 2012-07-21 NULL 2012-04-11
## sales_ord_create_dttm sales_ord_tran_dt print_dt timezn_nm
## 1 2015-09-11 18:17:45 2015-09-11 2015-09-12 EST
## 2 2009-07-06 00:00:00 2009-07-05 2009-09-01 PST
## 3 2006-04-05 00:00:00 2006-04-05 2006-04-05 MST
## 4 2011-07-01 17:38:50 2011-07-01 2011-07-06 CST
## 5 2005-06-18 00:00:00 2005-06-18 2005-06-28 PST
## 6 2012-07-21 17:20:18 2012-07-21 2012-07-21 PST
## venue_city venue_state venue_postal_cd_sgmt_1
## 1 MANSFIELD MASSACHUSETTS 02048
## 2 QUINCY WASHINGTON 98848
## 3 PHOENIX ARIZONA 85003
## 4 DALLAS TEXAS 75210
## 5 AUBURN WASHINGTON 98092
## 6 SAN BERNARDINO CALIFORNIA 92407
## sales_platform_cd print_flg la_valid_tkt_event_flg fin_mkt_nm
## 1 www.concerts.livenation.com T N Boston
## 2 NULL T N Seattle
## 3 NULL T N Arizona
## 4 NULL T N Dallas
## 5 NULL T N Seattle
## 6 www.livenation.com T N Los Angeles
## web_session_cookie_val gndr_cd age_yr income_amt edu_val
## 1 7dfa56dd7d5956b17587 <NA> <NA> <NA> <NA>
## 2 4f9e6fc637eaf7b736c2 <NA> <NA> <NA> <NA>
## 3 6c2545703bd527a7144d <NA> <NA> <NA> <NA>
## 4 527d6b1eaffc69ddd882 <NA> <NA> <NA> <NA>
## 5 8bd62c394a35213bdf52 <NA> <NA> <NA> <NA>
## 6 3b3a628f83135acd0676 <NA> <NA> <NA> <NA>
## edu_1st_indv_val edu_2nd_indv_val adults_in_hh_num married_ind
## 1 <NA> <NA> <NA> <NA>
## 2 <NA> <NA> <NA> <NA>
## 3 <NA> <NA> <NA> <NA>
## 4 <NA> <NA> <NA> <NA>
## 5 <NA> <NA> <NA> <NA>
## 6 <NA> <NA> <NA> <NA>
## child_present_ind home_owner_ind occpn_val occpn_1st_val occpn_2nd_val
## 1 <NA> <NA> <NA> <NA> <NA>
## 2 <NA> <NA> <NA> <NA> <NA>
## 3 <NA> <NA> <NA> <NA> <NA>
## 4 <NA> <NA> <NA> <NA> <NA>
## 5 <NA> <NA> <NA> <NA> <NA>
## 6 <NA> <NA> <NA> <NA> <NA>
## dist_to_ven
## 1 NA
## 2 59
## 3 NA
## 4 NA
## 5 NA
## 6 NA
# View column names of sales
names(sales)
## [1] "X" "event_id"
## [3] "primary_act_id" "secondary_act_id"
## [5] "purch_party_lkup_id" "event_name"
## [7] "primary_act_name" "secondary_act_name"
## [9] "major_cat_name" "minor_cat_name"
## [11] "la_event_type_cat" "event_disp_name"
## [13] "ticket_text" "tickets_purchased_qty"
## [15] "trans_face_val_amt" "delivery_type_cd"
## [17] "event_date_time" "event_dt"
## [19] "presale_dt" "onsale_dt"
## [21] "sales_ord_create_dttm" "sales_ord_tran_dt"
## [23] "print_dt" "timezn_nm"
## [25] "venue_city" "venue_state"
## [27] "venue_postal_cd_sgmt_1" "sales_platform_cd"
## [29] "print_flg" "la_valid_tkt_event_flg"
## [31] "fin_mkt_nm" "web_session_cookie_val"
## [33] "gndr_cd" "age_yr"
## [35] "income_amt" "edu_val"
## [37] "edu_1st_indv_val" "edu_2nd_indv_val"
## [39] "adults_in_hh_num" "married_ind"
## [41] "child_present_ind" "home_owner_ind"
## [43] "occpn_val" "occpn_1st_val"
## [45] "occpn_2nd_val" "dist_to_ven"
# Look at structure of sales
str(sales)
## 'data.frame': 5000 obs. of 46 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ event_id : chr "abcaf1adb99a935fc661" "6c56d7f08c95f2aa453c" "c7ab4524a121f9d687d2" "394cb493f893be9b9ed1" ...
## $ primary_act_id : chr "43f0436b905bfa7c2eec" "1a3e9aecd0617706a794" "4b677c3f5bec71eec8d1" "b1ccea01ad6ef8522796" ...
## $ secondary_act_id : chr "b85143bf51323b72e53c" "f53529c5679ea6ca5a48" "b85143bf51323b72e53c" "b85143bf51323b72e53c" ...
## $ purch_party_lkup_id : chr "7dfa56dd7d5956b17587" "4f9e6fc637eaf7b736c2" "6c2545703bd527a7144d" "527d6b1eaffc69ddd882" ...
## $ event_name : chr "Xfinity Center Mansfield Premier Parking: Florida Georgia Line" "Gorge Camping - dave matthews band - sept 3-7" "Dodge Theatre Adams Street Parking - benise" "Gexa Energy Pavilion Vip Parking : kid rock with sheryl crow" ...
## $ primary_act_name : chr "XFINITY Center Mansfield Premier Parking" "Gorge Camping" "Parking Event" "Gexa Energy Pavilion VIP Parking" ...
## $ secondary_act_name : chr "NULL" "Dave Matthews Band" "NULL" "NULL" ...
## $ major_cat_name : chr "MISC" "MISC" "MISC" "MISC" ...
## $ minor_cat_name : chr "PARKING" "CAMPING" "PARKING" "PARKING" ...
## $ la_event_type_cat : chr "PARKING" "INVALID" "PARKING" "PARKING" ...
## $ event_disp_name : chr "Xfinity Center Mansfield Premier Parking: Florida Georgia Line" "Gorge Camping - dave matthews band - sept 3-7" "Dodge Theatre Adams Street Parking - benise" "Gexa Energy Pavilion Vip Parking : kid rock with sheryl crow" ...
## $ ticket_text : chr " THIS TICKET IS VALID FOR PARKING ONLY GOOD THIS DAY ONLY PREMIER PARKING PASS XFINIT"| __truncated__ "%OVERNIGHT C A M P I N G%* * * * * *%GORGE CAMPGROUND%* GOOD THIS DATE ONLY *%SEP 3 - 6, 2009" "ADAMS STREET GARAGE%PARKING FOR 4/21/06 ONLY%DODGE THEATRE PARKING PASS%ENTRANCE ON ADAMS STREET%BENISE%GARAGE OPENS AT 6:00PM" " THIS TICKET IS VALID FOR PARKING ONLY GOOD FOR THIS DATE ONLY VIP PARKING PASS GEXA"| __truncated__ ...
## $ tickets_purchased_qty : int 1 1 1 1 1 2 1 1 1 1 ...
## $ trans_face_val_amt : num 45 75 5 20 20 10 30 28 20 25 ...
## $ delivery_type_cd : chr "eTicket" "TicketFast" "TicketFast" "Mail" ...
## $ event_date_time : chr "2015-09-12 23:30:00" "2009-09-05 01:00:00" "2006-04-22 01:30:00" "2011-09-03 00:00:00" ...
## $ event_dt : chr "2015-09-12" "2009-09-04" "2006-04-21" "2011-09-02" ...
## $ presale_dt : chr "NULL" "NULL" "NULL" "NULL" ...
## $ onsale_dt : chr "2015-05-15" "2009-03-13" "2006-02-25" "2011-04-22" ...
## $ sales_ord_create_dttm : chr "2015-09-11 18:17:45" "2009-07-06 00:00:00" "2006-04-05 00:00:00" "2011-07-01 17:38:50" ...
## $ sales_ord_tran_dt : chr "2015-09-11" "2009-07-05" "2006-04-05" "2011-07-01" ...
## $ print_dt : chr "2015-09-12" "2009-09-01" "2006-04-05" "2011-07-06" ...
## $ timezn_nm : chr "EST" "PST" "MST" "CST" ...
## $ venue_city : chr "MANSFIELD" "QUINCY" "PHOENIX" "DALLAS" ...
## $ venue_state : chr "MASSACHUSETTS" "WASHINGTON" "ARIZONA" "TEXAS" ...
## $ venue_postal_cd_sgmt_1: chr "02048" "98848" "85003" "75210" ...
## $ sales_platform_cd : chr "www.concerts.livenation.com" "NULL" "NULL" "NULL" ...
## $ print_flg : chr "T " "T " "T " "T " ...
## $ la_valid_tkt_event_flg: chr "N " "N " "N " "N " ...
## $ fin_mkt_nm : chr "Boston" "Seattle" "Arizona" "Dallas" ...
## $ web_session_cookie_val: chr "7dfa56dd7d5956b17587" "4f9e6fc637eaf7b736c2" "6c2545703bd527a7144d" "527d6b1eaffc69ddd882" ...
## $ gndr_cd : chr NA NA NA NA ...
## $ age_yr : chr NA NA NA NA ...
## $ income_amt : chr NA NA NA NA ...
## $ edu_val : chr NA NA NA NA ...
## $ edu_1st_indv_val : chr NA NA NA NA ...
## $ edu_2nd_indv_val : chr NA NA NA NA ...
## $ adults_in_hh_num : chr NA NA NA NA ...
## $ married_ind : chr NA NA NA NA ...
## $ child_present_ind : chr NA NA NA NA ...
## $ home_owner_ind : chr NA NA NA NA ...
## $ occpn_val : chr NA NA NA NA ...
## $ occpn_1st_val : chr NA NA NA NA ...
## $ occpn_2nd_val : chr NA NA NA NA ...
## $ dist_to_ven : int NA 59 NA NA NA NA NA NA NA NA ...
# View a summary of sales
summary(sales)
## X event_id primary_act_id secondary_act_id
## Min. : 1 Length:5000 Length:5000 Length:5000
## 1st Qu.:1251 Class :character Class :character Class :character
## Median :2500 Mode :character Mode :character Mode :character
## Mean :2500
## 3rd Qu.:3750
## Max. :5000
##
## purch_party_lkup_id event_name primary_act_name
## Length:5000 Length:5000 Length:5000
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## secondary_act_name major_cat_name minor_cat_name
## Length:5000 Length:5000 Length:5000
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## la_event_type_cat event_disp_name ticket_text
## Length:5000 Length:5000 Length:5000
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## tickets_purchased_qty trans_face_val_amt delivery_type_cd
## Min. :1.000 Min. : 1.00 Length:5000
## 1st Qu.:1.000 1st Qu.: 20.00 Class :character
## Median :1.000 Median : 30.00 Mode :character
## Mean :1.639 Mean : 77.08
## 3rd Qu.:2.000 3rd Qu.: 85.00
## Max. :8.000 Max. :1520.88
##
## event_date_time event_dt presale_dt
## Length:5000 Length:5000 Length:5000
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## onsale_dt sales_ord_create_dttm sales_ord_tran_dt
## Length:5000 Length:5000 Length:5000
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## print_dt timezn_nm venue_city
## Length:5000 Length:5000 Length:5000
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## venue_state venue_postal_cd_sgmt_1 sales_platform_cd
## Length:5000 Length:5000 Length:5000
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## print_flg la_valid_tkt_event_flg fin_mkt_nm
## Length:5000 Length:5000 Length:5000
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## web_session_cookie_val gndr_cd age_yr
## Length:5000 Length:5000 Length:5000
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## income_amt edu_val edu_1st_indv_val
## Length:5000 Length:5000 Length:5000
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## edu_2nd_indv_val adults_in_hh_num married_ind
## Length:5000 Length:5000 Length:5000
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## child_present_ind home_owner_ind occpn_val
## Length:5000 Length:5000 Length:5000
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## occpn_1st_val occpn_2nd_val dist_to_ven
## Length:5000 Length:5000 Min. : 0.0
## Class :character Class :character 1st Qu.: 12.0
## Mode :character Mode :character Median : 26.0
## Mean : 158.2
## 3rd Qu.: 77.5
## Max. :2548.0
## NA's :4677
# Load dplyr
library(dplyr)
# Get a glimpse of sales
glimpse(sales)
## Observations: 5,000
## Variables: 46
## $ X <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, ...
## $ event_id <chr> "abcaf1adb99a935fc661", "6c56d7f08c95f2...
## $ primary_act_id <chr> "43f0436b905bfa7c2eec", "1a3e9aecd06177...
## $ secondary_act_id <chr> "b85143bf51323b72e53c", "f53529c5679ea6...
## $ purch_party_lkup_id <chr> "7dfa56dd7d5956b17587", "4f9e6fc637eaf7...
## $ event_name <chr> "Xfinity Center Mansfield Premier Parki...
## $ primary_act_name <chr> "XFINITY Center Mansfield Premier Parki...
## $ secondary_act_name <chr> "NULL", "Dave Matthews Band", "NULL", "...
## $ major_cat_name <chr> "MISC", "MISC", "MISC", "MISC", "MISC",...
## $ minor_cat_name <chr> "PARKING", "CAMPING", "PARKING", "PARKI...
## $ la_event_type_cat <chr> "PARKING", "INVALID", "PARKING", "PARKI...
## $ event_disp_name <chr> "Xfinity Center Mansfield Premier Parki...
## $ ticket_text <chr> " THIS TICKET IS VALID FOR PAR...
## $ tickets_purchased_qty <int> 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 4, ...
## $ trans_face_val_amt <dbl> 45, 75, 5, 20, 20, 10, 30, 28, 20, 25, ...
## $ delivery_type_cd <chr> "eTicket", "TicketFast", "TicketFast", ...
## $ event_date_time <chr> "2015-09-12 23:30:00", "2009-09-05 01:0...
## $ event_dt <chr> "2015-09-12", "2009-09-04", "2006-04-21...
## $ presale_dt <chr> "NULL", "NULL", "NULL", "NULL", "2005-0...
## $ onsale_dt <chr> "2015-05-15", "2009-03-13", "2006-02-25...
## $ sales_ord_create_dttm <chr> "2015-09-11 18:17:45", "2009-07-06 00:0...
## $ sales_ord_tran_dt <chr> "2015-09-11", "2009-07-05", "2006-04-05...
## $ print_dt <chr> "2015-09-12", "2009-09-01", "2006-04-05...
## $ timezn_nm <chr> "EST", "PST", "MST", "CST", "PST", "PST...
## $ venue_city <chr> "MANSFIELD", "QUINCY", "PHOENIX", "DALL...
## $ venue_state <chr> "MASSACHUSETTS", "WASHINGTON", "ARIZONA...
## $ venue_postal_cd_sgmt_1 <chr> "02048", "98848", "85003", "75210", "98...
## $ sales_platform_cd <chr> "www.concerts.livenation.com", "NULL", ...
## $ print_flg <chr> "T ", "T ", "T ", "T ", "T ", "T ", "T ...
## $ la_valid_tkt_event_flg <chr> "N ", "N ", "N ", "N ", "N ", "N ", "N ...
## $ fin_mkt_nm <chr> "Boston", "Seattle", "Arizona", "Dallas...
## $ web_session_cookie_val <chr> "7dfa56dd7d5956b17587", "4f9e6fc637eaf7...
## $ gndr_cd <chr> NA, NA, NA, NA, NA, NA, "M", NA, NA, NA...
## $ age_yr <chr> NA, NA, NA, NA, NA, NA, "28", NA, NA, N...
## $ income_amt <chr> NA, NA, NA, NA, NA, NA, "112500", NA, N...
## $ edu_val <chr> NA, NA, NA, NA, NA, NA, "High School", ...
## $ edu_1st_indv_val <chr> NA, NA, NA, NA, NA, NA, "High School", ...
## $ edu_2nd_indv_val <chr> NA, NA, NA, NA, NA, NA, "NULL", NA, NA,...
## $ adults_in_hh_num <chr> NA, NA, NA, NA, NA, NA, "4", NA, NA, NA...
## $ married_ind <chr> NA, NA, NA, NA, NA, NA, "0", NA, NA, NA...
## $ child_present_ind <chr> NA, NA, NA, NA, NA, NA, "1", NA, NA, NA...
## $ home_owner_ind <chr> NA, NA, NA, NA, NA, NA, "0", NA, NA, NA...
## $ occpn_val <chr> NA, NA, NA, NA, NA, NA, "NULL", NA, NA,...
## $ occpn_1st_val <chr> NA, NA, NA, NA, NA, NA, "Craftsman Blue...
## $ occpn_2nd_val <chr> NA, NA, NA, NA, NA, NA, "NULL", NA, NA,...
## $ dist_to_ven <int> NA, 59, NA, NA, NA, NA, NA, NA, NA, NA,...
# Remove the first column of sales: sales2
sales2 <- sales[,-1]
str(sales[,1:5])
## 'data.frame': 5000 obs. of 5 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ event_id : chr "abcaf1adb99a935fc661" "6c56d7f08c95f2aa453c" "c7ab4524a121f9d687d2" "394cb493f893be9b9ed1" ...
## $ primary_act_id : chr "43f0436b905bfa7c2eec" "1a3e9aecd0617706a794" "4b677c3f5bec71eec8d1" "b1ccea01ad6ef8522796" ...
## $ secondary_act_id : chr "b85143bf51323b72e53c" "f53529c5679ea6ca5a48" "b85143bf51323b72e53c" "b85143bf51323b72e53c" ...
## $ purch_party_lkup_id: chr "7dfa56dd7d5956b17587" "4f9e6fc637eaf7b736c2" "6c2545703bd527a7144d" "527d6b1eaffc69ddd882" ...
str(sales2[,1:5])
## 'data.frame': 5000 obs. of 5 variables:
## $ event_id : chr "abcaf1adb99a935fc661" "6c56d7f08c95f2aa453c" "c7ab4524a121f9d687d2" "394cb493f893be9b9ed1" ...
## $ primary_act_id : chr "43f0436b905bfa7c2eec" "1a3e9aecd0617706a794" "4b677c3f5bec71eec8d1" "b1ccea01ad6ef8522796" ...
## $ secondary_act_id : chr "b85143bf51323b72e53c" "f53529c5679ea6ca5a48" "b85143bf51323b72e53c" "b85143bf51323b72e53c" ...
## $ purch_party_lkup_id: chr "7dfa56dd7d5956b17587" "4f9e6fc637eaf7b736c2" "6c2545703bd527a7144d" "527d6b1eaffc69ddd882" ...
## $ event_name : chr "Xfinity Center Mansfield Premier Parking: Florida Georgia Line" "Gorge Camping - dave matthews band - sept 3-7" "Dodge Theatre Adams Street Parking - benise" "Gexa Energy Pavilion Vip Parking : kid rock with sheryl crow" ...
# Define a vector of column indices: keep
## We don't want the first 4 coumns or the last 15
keep <- seq(5,ncol(sales2)-15,1)
# Subset sales2 using keep: sales3
sales3 <- sales2[,keep]
glimpse(sales3)
## Observations: 5,000
## Variables: 26
## $ event_name <chr> "Xfinity Center Mansfield Premier Parki...
## $ primary_act_name <chr> "XFINITY Center Mansfield Premier Parki...
## $ secondary_act_name <chr> "NULL", "Dave Matthews Band", "NULL", "...
## $ major_cat_name <chr> "MISC", "MISC", "MISC", "MISC", "MISC",...
## $ minor_cat_name <chr> "PARKING", "CAMPING", "PARKING", "PARKI...
## $ la_event_type_cat <chr> "PARKING", "INVALID", "PARKING", "PARKI...
## $ event_disp_name <chr> "Xfinity Center Mansfield Premier Parki...
## $ ticket_text <chr> " THIS TICKET IS VALID FOR PAR...
## $ tickets_purchased_qty <int> 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 4, ...
## $ trans_face_val_amt <dbl> 45, 75, 5, 20, 20, 10, 30, 28, 20, 25, ...
## $ delivery_type_cd <chr> "eTicket", "TicketFast", "TicketFast", ...
## $ event_date_time <chr> "2015-09-12 23:30:00", "2009-09-05 01:0...
## $ event_dt <chr> "2015-09-12", "2009-09-04", "2006-04-21...
## $ presale_dt <chr> "NULL", "NULL", "NULL", "NULL", "2005-0...
## $ onsale_dt <chr> "2015-05-15", "2009-03-13", "2006-02-25...
## $ sales_ord_create_dttm <chr> "2015-09-11 18:17:45", "2009-07-06 00:0...
## $ sales_ord_tran_dt <chr> "2015-09-11", "2009-07-05", "2006-04-05...
## $ print_dt <chr> "2015-09-12", "2009-09-01", "2006-04-05...
## $ timezn_nm <chr> "EST", "PST", "MST", "CST", "PST", "PST...
## $ venue_city <chr> "MANSFIELD", "QUINCY", "PHOENIX", "DALL...
## $ venue_state <chr> "MASSACHUSETTS", "WASHINGTON", "ARIZONA...
## $ venue_postal_cd_sgmt_1 <chr> "02048", "98848", "85003", "75210", "98...
## $ sales_platform_cd <chr> "www.concerts.livenation.com", "NULL", ...
## $ print_flg <chr> "T ", "T ", "T ", "T ", "T ", "T ", "T ...
## $ la_valid_tkt_event_flg <chr> "N ", "N ", "N ", "N ", "N ", "N ", "N ...
## $ fin_mkt_nm <chr> "Boston", "Seattle", "Arizona", "Dallas...
# Load tidyr
library(tidyr)
# Split event_date_time: sales4
head(sales3$event_date_time)
## [1] "2015-09-12 23:30:00" "2009-09-05 01:00:00" "2006-04-22 01:30:00"
## [4] "2011-09-03 00:00:00" "2005-07-31 01:00:00" "2012-07-22 02:00:00"
sales4 <- separate(sales3, event_date_time, c('event_dt', 'event_time'), sep = " ")
## check new columns
col <- str_detect(names(sales4),'event')
glimpse(sales4[,col])
## Observations: 5,000
## Variables: 7
## $ event_name <chr> "Xfinity Center Mansfield Premier Parki...
## $ la_event_type_cat <chr> "PARKING", "INVALID", "PARKING", "PARKI...
## $ event_disp_name <chr> "Xfinity Center Mansfield Premier Parki...
## $ event_dt <chr> "2015-09-12", "2009-09-05", "2006-04-22...
## $ event_time <chr> "23:30:00", "01:00:00", "01:30:00", "00...
## $ event_dt.1 <chr> "2015-09-12", "2009-09-04", "2006-04-21...
## $ la_valid_tkt_event_flg <chr> "N ", "N ", "N ", "N ", "N ", "N ", "N ...
# Split sales_ord_create_dttm: sales5
head(sales4$sales_ord_create_dttm)
## [1] "2015-09-11 18:17:45" "2009-07-06 00:00:00" "2006-04-05 00:00:00"
## [4] "2011-07-01 17:38:50" "2005-06-18 00:00:00" "2012-07-21 17:20:18"
sales5 <- separate(sales4, sales_ord_create_dttm, c('ord_create_dt', 'ord_create_time'), sep = " ")
## check new columns
col <- str_detect(names(sales5),'ord_create')
glimpse(sales5[,col])
## Observations: 5,000
## Variables: 2
## $ ord_create_dt <chr> "2015-09-11", "2009-07-06", "2006-04-05", "201...
## $ ord_create_time <chr> "18:17:45", "00:00:00", "00:00:00", "17:38:50"...
separate
in the last code block the following warning is printed to the console
Warning message: Too few values at 4 locations: 2516, 3863, 4082, 4183
warnings=F
set on all code chunks# Define an issues vector
issues <- c(2516, 3863, 4082, 4183)
# Print values of sales_ord_create_dttm at these indices
sales3$sales_ord_create_dttm[issues]
## [1] "NULL" "NULL" "NULL" "NULL"
# Print a well-behaved value of sales_ord_create_dttm
sales3$sales_ord_create_dttm[2517]
## [1] "2013-08-04 23:07:19"
# Load stringr
library(stringr)
# Find columns of sales5 containing "dt": date_cols
date_cols <- str_detect(colnames(sales5), "dt")
glimpse(sales5[,date_cols])
## Observations: 5,000
## Variables: 7
## $ event_dt <chr> "2015-09-12", "2009-09-05", "2006-04-22", "2...
## $ event_dt.1 <chr> "2015-09-12", "2009-09-04", "2006-04-21", "2...
## $ presale_dt <chr> "NULL", "NULL", "NULL", "NULL", "2005-03-02"...
## $ onsale_dt <chr> "2015-05-15", "2009-03-13", "2006-02-25", "2...
## $ ord_create_dt <chr> "2015-09-11", "2009-07-06", "2006-04-05", "2...
## $ sales_ord_tran_dt <chr> "2015-09-11", "2009-07-05", "2006-04-05", "2...
## $ print_dt <chr> "2015-09-12", "2009-09-01", "2006-04-05", "2...
# Load lubridate
library(lubridate)
# Coerce date columns into Date objects
sales5[, date_cols] <- lapply(sales5[, date_cols], ymd)
# Check column types
glimpse(sales5[,date_cols])
## Observations: 5,000
## Variables: 7
## $ event_dt <date> 2015-09-12, 2009-09-05, 2006-04-22, 2011-09...
## $ event_dt.1 <date> 2015-09-12, 2009-09-04, 2006-04-21, 2011-09...
## $ presale_dt <date> NA, NA, NA, NA, 2005-03-02, NA, NA, NA, NA,...
## $ onsale_dt <date> 2015-05-15, 2009-03-13, 2006-02-25, 2011-04...
## $ ord_create_dt <date> 2015-09-11, 2009-07-06, 2006-04-05, 2011-07...
## $ sales_ord_tran_dt <date> 2015-09-11, 2009-07-05, 2006-04-05, 2011-07...
## $ print_dt <date> 2015-09-12, 2009-09-01, 2006-04-05, 2011-07...
ymd
function in the lapply above I get the following warnings in the console
Warning message: 2892 failed to parse.
Warning message: 101 failed to parse.
Warning message: 4 failed to parse.
Warning message: 424 failed to parse.
NA
lapply
the ia.na
function on all date columnsTRUE
s## stringr is loaded
# Find date columns (don't change)
date_cols <- str_detect(names(sales5), "dt")
# Create logical vectors indicating missing values (don't change)
missing <- lapply(sales5[, date_cols], is.na)
# Create a numerical vector that counts missing values: num_missing
num_missing <- sapply(missing, sum)
# Print num_missing
num_missing
## event_dt event_dt.1 presale_dt onsale_dt
## 0 0 2892 101
## ord_create_dt sales_ord_tran_dt print_dt
## 4 0 424
## tidyr is loaded
# Combine the venue_city and venue_state columns
sales6 <- unite(sales5, venue_city_state, venue_city, venue_state, sep = ", ")
# View the head of sales6
head(sales6)
## event_name
## 1 Xfinity Center Mansfield Premier Parking: Florida Georgia Line
## 2 Gorge Camping - dave matthews band - sept 3-7
## 3 Dodge Theatre Adams Street Parking - benise
## 4 Gexa Energy Pavilion Vip Parking : kid rock with sheryl crow
## 5 Premier Parking - motley crue
## 6 Fast Lane Access: Journey
## primary_act_name secondary_act_name
## 1 XFINITY Center Mansfield Premier Parking NULL
## 2 Gorge Camping Dave Matthews Band
## 3 Parking Event NULL
## 4 Gexa Energy Pavilion VIP Parking NULL
## 5 White River Amphitheatre Premier Parking NULL
## 6 Fast Lane Access Journey
## major_cat_name minor_cat_name la_event_type_cat
## 1 MISC PARKING PARKING
## 2 MISC CAMPING INVALID
## 3 MISC PARKING PARKING
## 4 MISC PARKING PARKING
## 5 MISC PARKING PARKING
## 6 MISC SPECIAL ENTRY (UPSELL) UPSELL
## event_disp_name
## 1 Xfinity Center Mansfield Premier Parking: Florida Georgia Line
## 2 Gorge Camping - dave matthews band - sept 3-7
## 3 Dodge Theatre Adams Street Parking - benise
## 4 Gexa Energy Pavilion Vip Parking : kid rock with sheryl crow
## 5 Premier Parking - motley crue
## 6 Fast Lane Access: Journey
## ticket_text
## 1 THIS TICKET IS VALID FOR PARKING ONLY GOOD THIS DAY ONLY PREMIER PARKING PASS XFINITY CENTER,LOTS 4 PM SAT SEP 12 2015 7:30 PM
## 2 %OVERNIGHT C A M P I N G%* * * * * *%GORGE CAMPGROUND%* GOOD THIS DATE ONLY *%SEP 3 - 6, 2009
## 3 ADAMS STREET GARAGE%PARKING FOR 4/21/06 ONLY%DODGE THEATRE PARKING PASS%ENTRANCE ON ADAMS STREET%BENISE%GARAGE OPENS AT 6:00PM
## 4 THIS TICKET IS VALID FOR PARKING ONLY GOOD FOR THIS DATE ONLY VIP PARKING PASS GEXA ENERGY PAVILION FRI SEP 02 2011 7:00 PM
## 5 THIS TICKET IS VALID%FOR PARKING ONLY%GOOD THIS DATE ONLY%PREMIER PARKING PASS%WHITE RIVER AMPHITHEATRE%SAT JUL 30, 2005 6:00PM
## 6 FAST LANE JOURNEY FAST LANE EVENT THIS IS NOT A TICKET SAN MANUEL AMPHITHEATER SAT JUL 21 2012 7:00 PM
## tickets_purchased_qty trans_face_val_amt delivery_type_cd event_dt
## 1 1 45 eTicket 2015-09-12
## 2 1 75 TicketFast 2009-09-05
## 3 1 5 TicketFast 2006-04-22
## 4 1 20 Mail 2011-09-03
## 5 1 20 Mail 2005-07-31
## 6 2 10 TicketFast 2012-07-22
## event_time event_dt.1 presale_dt onsale_dt ord_create_dt
## 1 23:30:00 2015-09-12 <NA> 2015-05-15 2015-09-11
## 2 01:00:00 2009-09-04 <NA> 2009-03-13 2009-07-06
## 3 01:30:00 2006-04-21 <NA> 2006-02-25 2006-04-05
## 4 00:00:00 2011-09-02 <NA> 2011-04-22 2011-07-01
## 5 01:00:00 2005-07-30 2005-03-02 2005-03-04 2005-06-18
## 6 02:00:00 2012-07-21 <NA> 2012-04-11 2012-07-21
## ord_create_time sales_ord_tran_dt print_dt timezn_nm
## 1 18:17:45 2015-09-11 2015-09-12 EST
## 2 00:00:00 2009-07-05 2009-09-01 PST
## 3 00:00:00 2006-04-05 2006-04-05 MST
## 4 17:38:50 2011-07-01 2011-07-06 CST
## 5 00:00:00 2005-06-18 2005-06-28 PST
## 6 17:20:18 2012-07-21 2012-07-21 PST
## venue_city_state venue_postal_cd_sgmt_1
## 1 MANSFIELD, MASSACHUSETTS 02048
## 2 QUINCY, WASHINGTON 98848
## 3 PHOENIX, ARIZONA 85003
## 4 DALLAS, TEXAS 75210
## 5 AUBURN, WASHINGTON 98092
## 6 SAN BERNARDINO, CALIFORNIA 92407
## sales_platform_cd print_flg la_valid_tkt_event_flg fin_mkt_nm
## 1 www.concerts.livenation.com T N Boston
## 2 NULL T N Seattle
## 3 NULL T N Arizona
## 4 NULL T N Dallas
## 5 NULL T N Seattle
## 6 www.livenation.com T N Los Angeles
head(sales6$venue_city_state)
## [1] "MANSFIELD, MASSACHUSETTS" "QUINCY, WASHINGTON"
## [3] "PHOENIX, ARIZONA" "DALLAS, TEXAS"
## [5] "AUBURN, WASHINGTON" "SAN BERNARDINO, CALIFORNIA"
# Load readxl
library(readxl)
# Import mbta.xlsx and skip first row: mbta
url_mbta <- 'http://s3.amazonaws.com/assets.datacamp.com/production/course_1294/datasets/mbta.xlsx'
download.file(url_mbta, 'data/mbta.xlsx')
mbta <- read_excel('data/mbta.xlsx', skip=1)
# View the structure of mbta
str(mbta)
## Classes 'tbl_df', 'tbl' and 'data.frame': 11 obs. of 60 variables:
## $ X__1 : num 1 2 3 4 5 6 7 8 9 10 ...
## $ mode : chr "All Modes by Qtr" "Boat" "Bus" "Commuter Rail" ...
## $ 2007-01: chr "NA" "4" "335.819" "142.2" ...
## $ 2007-02: chr "NA" "3.6" "338.675" "138.5" ...
## $ 2007-03: num 1188 40 340 138 459 ...
## $ 2007-04: chr "NA" "4.3" "352.162" "139.5" ...
## $ 2007-05: chr "NA" "4.9" "354.367" "139" ...
## $ 2007-06: num 1246 5.8 350.5 143 477 ...
## $ 2007-07: chr "NA" "6.521" "357.519" "142.391" ...
## $ 2007-08: chr "NA" "6.572" "355.479" "142.364" ...
## $ 2007-09: num 1256.57 5.47 372.6 143.05 499.57 ...
## $ 2007-10: chr "NA" "5.145" "368.847" "146.542" ...
## $ 2007-11: chr "NA" "3.763" "330.826" "145.089" ...
## $ 2007-12: num 1216.89 2.98 312.92 141.59 448.27 ...
## $ 2008-01: chr "NA" "3.175" "340.324" "142.145" ...
## $ 2008-02: chr "NA" "3.111" "352.905" "142.607" ...
## $ 2008-03: num 1253.52 3.51 361.15 137.45 494.05 ...
## $ 2008-04: chr "NA" "4.164" "368.189" "140.389" ...
## $ 2008-05: chr "NA" "4.015" "363.903" "142.585" ...
## $ 2008-06: num 1314.82 5.19 362.96 142.06 518.35 ...
## $ 2008-07: chr "NA" "6.016" "370.921" "145.731" ...
## $ 2008-08: chr "NA" "5.8" "361.057" "144.565" ...
## $ 2008-09: num 1307.04 4.59 389.54 141.91 517.32 ...
## $ 2008-10: chr "NA" "4.285" "357.974" "151.957" ...
## $ 2008-11: chr "NA" "3.488" "345.423" "152.952" ...
## $ 2008-12: num 1232.65 3.01 325.77 140.81 446.74 ...
## $ 2009-01: chr "NA" "3.014" "338.532" "141.448" ...
## $ 2009-02: chr "NA" "3.196" "360.412" "143.529" ...
## $ 2009-03: num 1209.79 3.33 353.69 142.89 467.22 ...
## $ 2009-04: chr "NA" "4.049" "359.38" "142.34" ...
## $ 2009-05: chr "NA" "4.119" "354.75" "144.225" ...
## $ 2009-06: num 1233.1 4.9 347.9 142 473.1 ...
## $ 2009-07: chr "NA" "6.444" "339.477" "137.691" ...
## $ 2009-08: chr "NA" "5.903" "332.661" "139.158" ...
## $ 2009-09: num 1230.5 4.7 374.3 139.1 500.4 ...
## $ 2009-10: chr "NA" "4.212" "385.868" "137.104" ...
## $ 2009-11: chr "NA" "3.576" "366.98" "129.343" ...
## $ 2009-12: num 1207.85 3.11 332.39 126.07 440.93 ...
## $ 2010-01: chr "NA" "3.207" "362.226" "130.91" ...
## $ 2010-02: chr "NA" "3.195" "361.138" "131.918" ...
## $ 2010-03: num 1208.86 3.48 373.44 131.25 483.4 ...
## $ 2010-04: chr "NA" "4.452" "378.611" "131.722" ...
## $ 2010-05: chr "NA" "4.415" "380.171" "128.8" ...
## $ 2010-06: num 1244.41 5.41 363.27 129.14 490.26 ...
## $ 2010-07: chr "NA" "6.513" "353.04" "122.935" ...
## $ 2010-08: chr "NA" "6.269" "343.688" "129.732" ...
## $ 2010-09: num 1225.5 4.7 381.6 132.9 521.1 ...
## $ 2010-10: chr "NA" "4.402" "384.987" "131.033" ...
## $ 2010-11: chr "NA" "3.731" "367.955" "130.889" ...
## $ 2010-12: num 1216.26 3.16 326.34 121.42 450.43 ...
## $ 2011-01: chr "NA" "3.14" "334.958" "128.396" ...
## $ 2011-02: chr "NA" "3.284" "346.234" "125.463" ...
## $ 2011-03: num 1223.45 3.67 380.4 134.37 516.73 ...
## $ 2011-04: chr "NA" "4.251" "380.446" "134.169" ...
## $ 2011-05: chr "NA" "4.431" "385.289" "136.14" ...
## $ 2011-06: num 1302.41 5.47 376.32 135.58 529.53 ...
## $ 2011-07: chr "NA" "6.581" "361.585" "132.41" ...
## $ 2011-08: chr "NA" "6.733" "353.793" "130.616" ...
## $ 2011-09: num 1291 5 388 137 550 ...
## $ 2011-10: chr "NA" "4.484" "398.456" "128.72" ...
# View the first 6 rows of mbta
head(mbta)
## # A tibble: 6 x 60
## X__1 mode `2007-01` `2007-02` `2007-03` `2007-04` `2007-05`
## <dbl> <chr> <chr> <chr> <dbl> <chr> <chr>
## 1 1 All Modes by Qtr NA NA 1187.653 NA NA
## 2 2 Boat 4 3.6 40.000 4.3 4.9
## 3 3 Bus 335.819 338.675 339.867 352.162 354.367
## 4 4 Commuter Rail 142.2 138.5 137.700 139.5 139
## 5 5 Heavy Rail 435.294 448.271 458.583 472.201 474.579
## 6 6 Light Rail 227.231 240.262 241.444 255.557 248.262
## # ... with 53 more variables: `2007-06` <dbl>, `2007-07` <chr>,
## # `2007-08` <chr>, `2007-09` <dbl>, `2007-10` <chr>, `2007-11` <chr>,
## # `2007-12` <dbl>, `2008-01` <chr>, `2008-02` <chr>, `2008-03` <dbl>,
## # `2008-04` <chr>, `2008-05` <chr>, `2008-06` <dbl>, `2008-07` <chr>,
## # `2008-08` <chr>, `2008-09` <dbl>, `2008-10` <chr>, `2008-11` <chr>,
## # `2008-12` <dbl>, `2009-01` <chr>, `2009-02` <chr>, `2009-03` <dbl>,
## # `2009-04` <chr>, `2009-05` <chr>, `2009-06` <dbl>, `2009-07` <chr>,
## # `2009-08` <chr>, `2009-09` <dbl>, `2009-10` <chr>, `2009-11` <chr>,
## # `2009-12` <dbl>, `2010-01` <chr>, `2010-02` <chr>, `2010-03` <dbl>,
## # `2010-04` <chr>, `2010-05` <chr>, `2010-06` <dbl>, `2010-07` <chr>,
## # `2010-08` <chr>, `2010-09` <dbl>, `2010-10` <chr>, `2010-11` <chr>,
## # `2010-12` <dbl>, `2011-01` <chr>, `2011-02` <chr>, `2011-03` <dbl>,
## # `2011-04` <chr>, `2011-05` <chr>, `2011-06` <dbl>, `2011-07` <chr>,
## # `2011-08` <chr>, `2011-09` <dbl>, `2011-10` <chr>
# View a summary of mbta
summary(mbta)
## X__1 mode 2007-01 2007-02
## Min. : 1.0 Length:11 Length:11 Length:11
## 1st Qu.: 3.5 Class :character Class :character Class :character
## Median : 6.0 Mode :character Mode :character Mode :character
## Mean : 6.0
## 3rd Qu.: 8.5
## Max. :11.0
## 2007-03 2007-04 2007-05
## Min. : 0.114 Length:11 Length:11
## 1st Qu.: 9.278 Class :character Class :character
## Median : 137.700 Mode :character Mode :character
## Mean : 330.293
## 3rd Qu.: 399.225
## Max. :1204.725
## 2007-06 2007-07 2007-08
## Min. : 0.096 Length:11 Length:11
## 1st Qu.: 5.700 Class :character Class :character
## Median : 143.000 Mode :character Mode :character
## Mean : 339.846
## 3rd Qu.: 413.788
## Max. :1246.129
## 2007-09 2007-10 2007-11
## Min. : -0.007 Length:11 Length:11
## 1st Qu.: 5.539 Class :character Class :character
## Median : 143.051 Mode :character Mode :character
## Mean : 352.554
## 3rd Qu.: 436.082
## Max. :1310.764
## 2007-12 2008-01 2008-02
## Min. : -0.060 Length:11 Length:11
## 1st Qu.: 4.385 Class :character Class :character
## Median : 141.585 Mode :character Mode :character
## Mean : 321.588
## 3rd Qu.: 380.594
## Max. :1216.890
## 2008-03 2008-04 2008-05
## Min. : 0.058 Length:11 Length:11
## 1st Qu.: 5.170 Class :character Class :character
## Median : 137.453 Mode :character Mode :character
## Mean : 345.604
## 3rd Qu.: 427.601
## Max. :1274.031
## 2008-06 2008-07 2008-08
## Min. : 0.060 Length:11 Length:11
## 1st Qu.: 5.742 Class :character Class :character
## Median : 142.057 Mode :character Mode :character
## Mean : 359.667
## 3rd Qu.: 440.656
## Max. :1320.728
## 2008-09 2008-10 2008-11
## Min. : 0.021 Length:11 Length:11
## 1st Qu.: 5.691 Class :character Class :character
## Median : 141.907 Mode :character Mode :character
## Mean : 362.099
## 3rd Qu.: 453.430
## Max. :1338.015
## 2008-12 2009-01 2009-02
## Min. : -0.015 Length:11 Length:11
## 1st Qu.: 4.689 Class :character Class :character
## Median : 140.810 Mode :character Mode :character
## Mean : 319.882
## 3rd Qu.: 386.255
## Max. :1232.655
## 2009-03 2009-04 2009-05
## Min. : -0.050 Length:11 Length:11
## 1st Qu.: 5.003 Class :character Class :character
## Median : 142.893 Mode :character Mode :character
## Mean : 330.142
## 3rd Qu.: 410.455
## Max. :1210.912
## 2009-06 2009-07 2009-08
## Min. : -0.079 Length:11 Length:11
## 1st Qu.: 5.845 Class :character Class :character
## Median : 142.006 Mode :character Mode :character
## Mean : 333.194
## 3rd Qu.: 410.482
## Max. :1233.085
## 2009-09 2009-10 2009-11
## Min. : -0.035 Length:11 Length:11
## 1st Qu.: 5.693 Class :character Class :character
## Median : 139.087 Mode :character Mode :character
## Mean : 346.687
## 3rd Qu.: 437.332
## Max. :1291.564
## 2009-12 2010-01 2010-02
## Min. : -0.022 Length:11 Length:11
## 1st Qu.: 4.784 Class :character Class :character
## Median : 126.066 Mode :character Mode :character
## Mean : 312.962
## 3rd Qu.: 386.659
## Max. :1207.845
## 2010-03 2010-04 2010-05
## Min. : 0.012 Length:11 Length:11
## 1st Qu.: 5.274 Class :character Class :character
## Median : 131.252 Mode :character Mode :character
## Mean : 332.726
## 3rd Qu.: 428.420
## Max. :1225.556
## 2010-06 2010-07 2010-08
## Min. : 0.008 Length:11 Length:11
## 1st Qu.: 6.436 Class :character Class :character
## Median : 129.144 Mode :character Mode :character
## Mean : 335.964
## 3rd Qu.: 426.769
## Max. :1244.409
## 2010-09 2010-10 2010-11
## Min. : 0.001 Length:11 Length:11
## 1st Qu.: 5.567 Class :character Class :character
## Median : 132.892 Mode :character Mode :character
## Mean : 346.524
## 3rd Qu.: 451.361
## Max. :1293.117
## 2010-12 2011-01 2011-02
## Min. : -0.004 Length:11 Length:11
## 1st Qu.: 4.466 Class :character Class :character
## Median : 121.422 Mode :character Mode :character
## Mean : 312.917
## 3rd Qu.: 388.385
## Max. :1216.262
## 2011-03 2011-04 2011-05
## Min. : 0.05 Length:11 Length:11
## 1st Qu.: 6.03 Class :character Class :character
## Median : 134.37 Mode :character Mode :character
## Mean : 345.17
## 3rd Qu.: 448.56
## Max. :1286.66
## 2011-06 2011-07 2011-08
## Min. : 0.054 Length:11 Length:11
## 1st Qu.: 6.926 Class :character Class :character
## Median : 135.581 Mode :character Mode :character
## Mean : 353.331
## 3rd Qu.: 452.923
## Max. :1302.414
## 2011-09 2011-10
## Min. : 0.043 Length:11
## 1st Qu.: 6.660 Class :character
## Median : 136.901 Mode :character
## Mean : 362.555
## 3rd Qu.: 469.204
## Max. :1348.754
# Remove rows 1, 7, and 11 of mbta: mbta2
keep <- !(mbta$mode %in% c('All Modes by Qtr', 'Pct Chg / Yr', 'TOTAL'))
mbta2 <- mbta[keep,]
glimpse(mbta2)
## Observations: 8
## Variables: 60
## $ X__1 <dbl> 2, 3, 4, 5, 6, 8, 9, 10
## $ mode <chr> "Boat", "Bus", "Commuter Rail", "Heavy Rail", "Light R...
## $ 2007-01 <chr> "4", "335.819", "142.2", "435.294", "227.231", "4.772"...
## $ 2007-02 <chr> "3.6", "338.675", "138.5", "448.271", "240.262", "4.41...
## $ 2007-03 <dbl> 40.000, 339.867, 137.700, 458.583, 241.444, 4.574, 5.5...
## $ 2007-04 <chr> "4.3", "352.162", "139.5", "472.201", "255.557", "4.54...
## $ 2007-05 <chr> "4.9", "354.367", "139", "474.579", "248.262", "4.768"...
## $ 2007-06 <dbl> 5.800, 350.543, 143.000, 477.032, 246.108, 4.722, 5.60...
## $ 2007-07 <chr> "6.521", "357.519", "142.391", "471.735", "243.286", "...
## $ 2007-08 <chr> "6.572", "355.479", "142.364", "461.605", "234.907", "...
## $ 2007-09 <dbl> 5.469, 372.598, 143.051, 499.566, 265.748, 4.329, 5.60...
## $ 2007-10 <chr> "5.145", "368.847", "146.542", "457.741", "241.434", "...
## $ 2007-11 <chr> "3.763", "330.826", "145.089", "488.348", "250.497", "...
## $ 2007-12 <dbl> 2.985, 312.920, 141.585, 448.268, 233.379, 3.708, 5.06...
## $ 2008-01 <chr> "3.175", "340.324", "142.145", "472.624", "241.223", "...
## $ 2008-02 <chr> "3.111", "352.905", "142.607", "492.1", "249.306", "4....
## $ 2008-03 <dbl> 3.512, 361.155, 137.453, 494.046, 253.132, 4.175, 6.16...
## $ 2008-04 <chr> "4.164", "368.189", "140.389", "513.204", "271.07", "4...
## $ 2008-05 <chr> "4.015", "363.903", "142.585", "507.952", "258.351", "...
## $ 2008-06 <dbl> 5.189, 362.962, 142.057, 518.349, 266.961, 4.400, 6.29...
## $ 2008-07 <chr> "6.016", "370.921", "145.731", "512.309", "270.158", "...
## $ 2008-08 <chr> "5.8", "361.057", "144.565", "476.99", "239.344", "4.2...
## $ 2008-09 <dbl> 4.587, 389.537, 141.907, 517.324, 258.171, 4.878, 6.50...
## $ 2008-10 <chr> "4.285", "357.974", "151.957", "523.644", "250.063", "...
## $ 2008-11 <chr> "3.488", "345.423", "152.952", "487.115", "232.068", "...
## $ 2008-12 <dbl> 3.007, 325.767, 140.810, 446.743, 205.420, 3.420, 5.95...
## $ 2009-01 <chr> "3.014", "338.532", "141.448", "461.004", "215.66", "3...
## $ 2009-02 <chr> "3.196", "360.412", "143.529", "482.407", "228.737", "...
## $ 2009-03 <dbl> 3.330, 353.686, 142.893, 467.224, 222.844, 2.586, 6.67...
## $ 2009-04 <chr> "4.049", "359.38", "142.34", "493.152", "238.232", "2....
## $ 2009-05 <chr> "4.119", "354.75", "144.225", "475.634", "224.962", "2...
## $ 2009-06 <dbl> 4.900, 347.865, 142.006, 473.099, 226.259, 2.763, 6.79...
## $ 2009-07 <chr> "6.444", "339.477", "137.691", "470.828", "230.308", "...
## $ 2009-08 <chr> "5.903", "332.661", "139.158", "466.676", "231.783", "...
## $ 2009-09 <dbl> 4.696, 374.260, 139.087, 500.403, 250.922, 2.716, 6.69...
## $ 2009-10 <chr> "4.212", "385.868", "137.104", "513.406", "230.739", "...
## $ 2009-11 <chr> "3.576", "366.98", "129.343", "480.278", "214.711", "2...
## $ 2009-12 <dbl> 3.113, 332.394, 126.066, 440.925, 194.446, 2.365, 6.45...
## $ 2010-01 <chr> "3.207", "362.226", "130.91", "464.069", "204.396", "2...
## $ 2010-02 <chr> "3.195", "361.138", "131.918", "480.121", "213.136", "...
## $ 2010-03 <dbl> 3.481, 373.443, 131.252, 483.397, 211.693, 2.550, 7.06...
## $ 2010-04 <chr> "4.452", "378.611", "131.722", "502.374", "227.246", "...
## $ 2010-05 <chr> "4.415", "380.171", "128.8", "487.4", "217.805", "2.68...
## $ 2010-06 <dbl> 5.411, 363.275, 129.144, 490.263, 215.922, 2.646, 7.46...
## $ 2010-07 <chr> "6.513", "353.04", "122.935", "488.587", "218.729", "2...
## $ 2010-08 <chr> "6.269", "343.688", "129.732", "473.731", "210.53", "2...
## $ 2010-09 <dbl> 4.699, 381.622, 132.892, 521.099, 236.368, 2.652, 7.34...
## $ 2010-10 <chr> "4.402", "384.987", "131.033", "532.403", "236.366", "...
## $ 2010-11 <chr> "3.731", "367.955", "130.889", "502.887", "221.881", "...
## $ 2010-12 <dbl> 3.156, 326.338, 121.422, 450.433, 196.211, 2.379, 7.19...
## $ 2011-01 <chr> "3.14", "334.958", "128.396", "468.418", "198.45", "2....
## $ 2011-02 <chr> "3.284", "346.234", "125.463", "504.068", "219.886", "...
## $ 2011-03 <dbl> 3.674, 380.399, 134.374, 516.730, 227.935, 2.559, 8.38...
## $ 2011-04 <chr> "4.251", "380.446", "134.169", "528.631", "242.28", "2...
## $ 2011-05 <chr> "4.431", "385.289", "136.14", "528.122", "225.776", "2...
## $ 2011-06 <dbl> 5.474, 376.317, 135.581, 529.528, 221.865, 2.815, 8.37...
## $ 2011-07 <chr> "6.581", "361.585", "132.41", "532.888", "231.01", "2....
## $ 2011-08 <chr> "6.733", "353.793", "130.616", "508.145", "220.164", "...
## $ 2011-09 <dbl> 5.003, 388.271, 136.901, 550.137, 244.949, 2.843, 8.31...
## $ 2011-10 <chr> "4.484", "398.456", "128.72", "554.932", "237.768", "2...
# Remove the first column of mbta2: mbta3
mbta3 <- mbta2[,-1]
glimpse(mbta3)
## Observations: 8
## Variables: 59
## $ mode <chr> "Boat", "Bus", "Commuter Rail", "Heavy Rail", "Light R...
## $ 2007-01 <chr> "4", "335.819", "142.2", "435.294", "227.231", "4.772"...
## $ 2007-02 <chr> "3.6", "338.675", "138.5", "448.271", "240.262", "4.41...
## $ 2007-03 <dbl> 40.000, 339.867, 137.700, 458.583, 241.444, 4.574, 5.5...
## $ 2007-04 <chr> "4.3", "352.162", "139.5", "472.201", "255.557", "4.54...
## $ 2007-05 <chr> "4.9", "354.367", "139", "474.579", "248.262", "4.768"...
## $ 2007-06 <dbl> 5.800, 350.543, 143.000, 477.032, 246.108, 4.722, 5.60...
## $ 2007-07 <chr> "6.521", "357.519", "142.391", "471.735", "243.286", "...
## $ 2007-08 <chr> "6.572", "355.479", "142.364", "461.605", "234.907", "...
## $ 2007-09 <dbl> 5.469, 372.598, 143.051, 499.566, 265.748, 4.329, 5.60...
## $ 2007-10 <chr> "5.145", "368.847", "146.542", "457.741", "241.434", "...
## $ 2007-11 <chr> "3.763", "330.826", "145.089", "488.348", "250.497", "...
## $ 2007-12 <dbl> 2.985, 312.920, 141.585, 448.268, 233.379, 3.708, 5.06...
## $ 2008-01 <chr> "3.175", "340.324", "142.145", "472.624", "241.223", "...
## $ 2008-02 <chr> "3.111", "352.905", "142.607", "492.1", "249.306", "4....
## $ 2008-03 <dbl> 3.512, 361.155, 137.453, 494.046, 253.132, 4.175, 6.16...
## $ 2008-04 <chr> "4.164", "368.189", "140.389", "513.204", "271.07", "4...
## $ 2008-05 <chr> "4.015", "363.903", "142.585", "507.952", "258.351", "...
## $ 2008-06 <dbl> 5.189, 362.962, 142.057, 518.349, 266.961, 4.400, 6.29...
## $ 2008-07 <chr> "6.016", "370.921", "145.731", "512.309", "270.158", "...
## $ 2008-08 <chr> "5.8", "361.057", "144.565", "476.99", "239.344", "4.2...
## $ 2008-09 <dbl> 4.587, 389.537, 141.907, 517.324, 258.171, 4.878, 6.50...
## $ 2008-10 <chr> "4.285", "357.974", "151.957", "523.644", "250.063", "...
## $ 2008-11 <chr> "3.488", "345.423", "152.952", "487.115", "232.068", "...
## $ 2008-12 <dbl> 3.007, 325.767, 140.810, 446.743, 205.420, 3.420, 5.95...
## $ 2009-01 <chr> "3.014", "338.532", "141.448", "461.004", "215.66", "3...
## $ 2009-02 <chr> "3.196", "360.412", "143.529", "482.407", "228.737", "...
## $ 2009-03 <dbl> 3.330, 353.686, 142.893, 467.224, 222.844, 2.586, 6.67...
## $ 2009-04 <chr> "4.049", "359.38", "142.34", "493.152", "238.232", "2....
## $ 2009-05 <chr> "4.119", "354.75", "144.225", "475.634", "224.962", "2...
## $ 2009-06 <dbl> 4.900, 347.865, 142.006, 473.099, 226.259, 2.763, 6.79...
## $ 2009-07 <chr> "6.444", "339.477", "137.691", "470.828", "230.308", "...
## $ 2009-08 <chr> "5.903", "332.661", "139.158", "466.676", "231.783", "...
## $ 2009-09 <dbl> 4.696, 374.260, 139.087, 500.403, 250.922, 2.716, 6.69...
## $ 2009-10 <chr> "4.212", "385.868", "137.104", "513.406", "230.739", "...
## $ 2009-11 <chr> "3.576", "366.98", "129.343", "480.278", "214.711", "2...
## $ 2009-12 <dbl> 3.113, 332.394, 126.066, 440.925, 194.446, 2.365, 6.45...
## $ 2010-01 <chr> "3.207", "362.226", "130.91", "464.069", "204.396", "2...
## $ 2010-02 <chr> "3.195", "361.138", "131.918", "480.121", "213.136", "...
## $ 2010-03 <dbl> 3.481, 373.443, 131.252, 483.397, 211.693, 2.550, 7.06...
## $ 2010-04 <chr> "4.452", "378.611", "131.722", "502.374", "227.246", "...
## $ 2010-05 <chr> "4.415", "380.171", "128.8", "487.4", "217.805", "2.68...
## $ 2010-06 <dbl> 5.411, 363.275, 129.144, 490.263, 215.922, 2.646, 7.46...
## $ 2010-07 <chr> "6.513", "353.04", "122.935", "488.587", "218.729", "2...
## $ 2010-08 <chr> "6.269", "343.688", "129.732", "473.731", "210.53", "2...
## $ 2010-09 <dbl> 4.699, 381.622, 132.892, 521.099, 236.368, 2.652, 7.34...
## $ 2010-10 <chr> "4.402", "384.987", "131.033", "532.403", "236.366", "...
## $ 2010-11 <chr> "3.731", "367.955", "130.889", "502.887", "221.881", "...
## $ 2010-12 <dbl> 3.156, 326.338, 121.422, 450.433, 196.211, 2.379, 7.19...
## $ 2011-01 <chr> "3.14", "334.958", "128.396", "468.418", "198.45", "2....
## $ 2011-02 <chr> "3.284", "346.234", "125.463", "504.068", "219.886", "...
## $ 2011-03 <dbl> 3.674, 380.399, 134.374, 516.730, 227.935, 2.559, 8.38...
## $ 2011-04 <chr> "4.251", "380.446", "134.169", "528.631", "242.28", "2...
## $ 2011-05 <chr> "4.431", "385.289", "136.14", "528.122", "225.776", "2...
## $ 2011-06 <dbl> 5.474, 376.317, 135.581, 529.528, 221.865, 2.815, 8.37...
## $ 2011-07 <chr> "6.581", "361.585", "132.41", "532.888", "231.01", "2....
## $ 2011-08 <chr> "6.733", "353.793", "130.616", "508.145", "220.164", "...
## $ 2011-09 <dbl> 5.003, 388.271, 136.901, 550.137, 244.949, 2.843, 8.31...
## $ 2011-10 <chr> "4.484", "398.456", "128.72", "554.932", "237.768", "2...
## mbta3 is pre-loaded
head(mbta3)
## # A tibble: 6 x 59
## mode `2007-01` `2007-02` `2007-03` `2007-04` `2007-05`
## <chr> <chr> <chr> <dbl> <chr> <chr>
## 1 Boat 4 3.6 40.000 4.3 4.9
## 2 Bus 335.819 338.675 339.867 352.162 354.367
## 3 Commuter Rail 142.2 138.5 137.700 139.5 139
## 4 Heavy Rail 435.294 448.271 458.583 472.201 474.579
## 5 Light Rail 227.231 240.262 241.444 255.557 248.262
## 6 Private Bus 4.772 4.417 4.574 4.542 4.768
## # ... with 53 more variables: `2007-06` <dbl>, `2007-07` <chr>,
## # `2007-08` <chr>, `2007-09` <dbl>, `2007-10` <chr>, `2007-11` <chr>,
## # `2007-12` <dbl>, `2008-01` <chr>, `2008-02` <chr>, `2008-03` <dbl>,
## # `2008-04` <chr>, `2008-05` <chr>, `2008-06` <dbl>, `2008-07` <chr>,
## # `2008-08` <chr>, `2008-09` <dbl>, `2008-10` <chr>, `2008-11` <chr>,
## # `2008-12` <dbl>, `2009-01` <chr>, `2009-02` <chr>, `2009-03` <dbl>,
## # `2009-04` <chr>, `2009-05` <chr>, `2009-06` <dbl>, `2009-07` <chr>,
## # `2009-08` <chr>, `2009-09` <dbl>, `2009-10` <chr>, `2009-11` <chr>,
## # `2009-12` <dbl>, `2010-01` <chr>, `2010-02` <chr>, `2010-03` <dbl>,
## # `2010-04` <chr>, `2010-05` <chr>, `2010-06` <dbl>, `2010-07` <chr>,
## # `2010-08` <chr>, `2010-09` <dbl>, `2010-10` <chr>, `2010-11` <chr>,
## # `2010-12` <dbl>, `2011-01` <chr>, `2011-02` <chr>, `2011-03` <dbl>,
## # `2011-04` <chr>, `2011-05` <chr>, `2011-06` <dbl>, `2011-07` <chr>,
## # `2011-08` <chr>, `2011-09` <dbl>, `2011-10` <chr>
# Load tidyr
library(tidyr)
# Gather columns of mbta3: mbta4
mbta4 <- gather(mbta3, month, thou_riders, -mode)
# View the head of mbta4
head(mbta4)
## # A tibble: 6 x 3
## mode month thou_riders
## <chr> <chr> <chr>
## 1 Boat 2007-01 4
## 2 Bus 2007-01 335.819
## 3 Commuter Rail 2007-01 142.2
## 4 Heavy Rail 2007-01 435.294
## 5 Light Rail 2007-01 227.231
## 6 Private Bus 2007-01 4.772
## mbta4 is pre-loaded
head(mbta4)
## # A tibble: 6 x 3
## mode month thou_riders
## <chr> <chr> <chr>
## 1 Boat 2007-01 4
## 2 Bus 2007-01 335.819
## 3 Commuter Rail 2007-01 142.2
## 4 Heavy Rail 2007-01 435.294
## 5 Light Rail 2007-01 227.231
## 6 Private Bus 2007-01 4.772
# Coerce thou_riders to numeric
mbta4$thou_riders <- as.numeric(mbta4$thou_riders)
## tidyr is pre-loaded
# Spread the contents of mbta4: mbta5
head(mbta4)
## # A tibble: 6 x 3
## mode month thou_riders
## <chr> <chr> <dbl>
## 1 Boat 2007-01 4.000
## 2 Bus 2007-01 335.819
## 3 Commuter Rail 2007-01 142.200
## 4 Heavy Rail 2007-01 435.294
## 5 Light Rail 2007-01 227.231
## 6 Private Bus 2007-01 4.772
mbta5 <- spread(mbta4, mode, thou_riders)
# View the head of mbta5
head(mbta5)
## # A tibble: 6 x 9
## month Boat Bus `Commuter Rail` `Heavy Rail` `Light Rail`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2007-01 4.0 335.819 142.2 435.294 227.231
## 2 2007-02 3.6 338.675 138.5 448.271 240.262
## 3 2007-03 40.0 339.867 137.7 458.583 241.444
## 4 2007-04 4.3 352.162 139.5 472.201 255.557
## 5 2007-05 4.9 354.367 139.0 474.579 248.262
## 6 2007-06 5.8 350.543 143.0 477.032 246.108
## # ... with 3 more variables: `Private Bus` <dbl>, RIDE <dbl>, `Trackless
## # Trolley` <dbl>
# View the head of mbta5
head(mbta5)
## # A tibble: 6 x 9
## month Boat Bus `Commuter Rail` `Heavy Rail` `Light Rail`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2007-01 4.0 335.819 142.2 435.294 227.231
## 2 2007-02 3.6 338.675 138.5 448.271 240.262
## 3 2007-03 40.0 339.867 137.7 458.583 241.444
## 4 2007-04 4.3 352.162 139.5 472.201 255.557
## 5 2007-05 4.9 354.367 139.0 474.579 248.262
## 6 2007-06 5.8 350.543 143.0 477.032 246.108
## # ... with 3 more variables: `Private Bus` <dbl>, RIDE <dbl>, `Trackless
## # Trolley` <dbl>
# Split month column into month and year: mbta6
mbta6 <- separate(mbta5, month, c('year', 'month'))
# View the head of mbta6
head(mbta6)
## # A tibble: 6 x 10
## year month Boat Bus `Commuter Rail` `Heavy Rail` `Light Rail`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2007 01 4.0 335.819 142.2 435.294 227.231
## 2 2007 02 3.6 338.675 138.5 448.271 240.262
## 3 2007 03 40.0 339.867 137.7 458.583 241.444
## 4 2007 04 4.3 352.162 139.5 472.201 255.557
## 5 2007 05 4.9 354.367 139.0 474.579 248.262
## 6 2007 06 5.8 350.543 143.0 477.032 246.108
## # ... with 3 more variables: `Private Bus` <dbl>, RIDE <dbl>, `Trackless
## # Trolley` <dbl>
# View a summary of mbta6
summary(mbta6)
## year month Boat Bus
## Length:58 Length:58 Min. : 2.985 Min. :312.9
## Class :character Class :character 1st Qu.: 3.494 1st Qu.:345.6
## Mode :character Mode :character Median : 4.293 Median :359.9
## Mean : 5.068 Mean :358.6
## 3rd Qu.: 5.356 3rd Qu.:372.2
## Max. :40.000 Max. :398.5
## Commuter Rail Heavy Rail Light Rail Private Bus
## Min. :121.4 Min. :435.3 Min. :194.4 Min. :2.213
## 1st Qu.:131.4 1st Qu.:471.1 1st Qu.:220.6 1st Qu.:2.641
## Median :138.8 Median :487.3 Median :231.9 Median :2.820
## Mean :137.4 Mean :489.3 Mean :233.0 Mean :3.352
## 3rd Qu.:142.4 3rd Qu.:511.3 3rd Qu.:244.5 3rd Qu.:4.167
## Max. :153.0 Max. :554.9 Max. :271.1 Max. :4.878
## RIDE Trackless Trolley
## Min. :4.900 Min. : 5.777
## 1st Qu.:5.965 1st Qu.:11.679
## Median :6.615 Median :12.598
## Mean :6.604 Mean :12.125
## 3rd Qu.:7.149 3rd Qu.:13.320
## Max. :8.598 Max. :15.109
# Generate a histogram of Boat ridership
hist(mbta6$Boat)
# Find the row number of the incorrect value: i
i <- which(mbta6$Boat == 40)
# Replace the incorrect value with 4
mbta6$Boat[i] <- 4
# Generate a histogram of Boat column
hist(mbta6$Boat)
library(dplyr)
library(ggplot2)
## need to use the old mbta data frame because ggplot likes the variables to be melted
head(mbta6)
## # A tibble: 6 x 10
## year month Boat Bus `Commuter Rail` `Heavy Rail` `Light Rail`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2007 01 4.0 335.819 142.2 435.294 227.231
## 2 2007 02 3.6 338.675 138.5 448.271 240.262
## 3 2007 03 4.0 339.867 137.7 458.583 241.444
## 4 2007 04 4.3 352.162 139.5 472.201 255.557
## 5 2007 05 4.9 354.367 139.0 474.579 248.262
## 6 2007 06 5.8 350.543 143.0 477.032 246.108
## # ... with 3 more variables: `Private Bus` <dbl>, RIDE <dbl>, `Trackless
## # Trolley` <dbl>
table(mbta4$mode)
##
## Boat Bus Commuter Rail Heavy Rail
## 58 58 58 58
## Light Rail Private Bus RIDE Trackless Trolley
## 58 58 58 58
mbta_all <- mbta6 %>%
unite(year_mon, year, month, sep = "") %>%
gather(mode, thou_riders, -year_mon)
mbta_boat <- mbta_all %>%
filter(mode %in% c("Boat","Trackless Trolley"))
head(mbta_boat)
## # A tibble: 6 x 3
## year_mon mode thou_riders
## <chr> <chr> <dbl>
## 1 200701 Boat 4.0
## 2 200702 Boat 3.6
## 3 200703 Boat 4.0
## 4 200704 Boat 4.3
## 5 200705 Boat 4.9
## 6 200706 Boat 5.8
table(mbta_boat$mode)
##
## Boat Trackless Trolley
## 58 58
# Look at Boat and Trackless Trolley ridership over time (don't change)
## The old outlier point for boat is still in here
ggplot(mbta_boat, aes(x = year_mon, y = thou_riders, col = mode)) + geom_point() +
scale_x_discrete(name = "Month", breaks = c(200701, 200801, 200901, 201001, 201101)) +
scale_y_continuous(name = "Avg Weekday Ridership (thousands)")
# Look at all T ridership over time (don't change)
ggplot(mbta_all, aes(x = year_mon, y = thou_riders, col = mode)) + geom_point() +
scale_x_discrete(name = "Month", breaks = c(200701, 200801, 200901, 201001, 201101)) +
scale_y_continuous(name = "Avg Weekday Ridership (thousands)")
# Load data.table
library(data.table)
# Import food.csv: food
url_food <- 'http://s3.amazonaws.com/assets.datacamp.com/production/course_1294/datasets/food.csv'
food <- fread(url_food)
# Convert food to a data frame
food <- data.frame(food)
# Loaded dplyr
# View a glimpse of food
glimpse(food)
## Observations: 1,500
## Variables: 160
## $ V1 <chr> "1", "2", "3", "4",...
## $ code <dbl> 1.592115e-311, 2.67...
## $ url <chr> "http://world-en.op...
## $ creator <chr> "sebleouf", "foodor...
## $ created_t <chr> "1424747544", "1450...
## $ created_datetime <chr> "2015-02-24T03:12:2...
## $ last_modified_t <chr> "1438445887", "1450...
## $ last_modified_datetime <chr> "2015-08-01T16:18:0...
## $ product_name <chr> "Confiture de frais...
## $ generic_name <chr> "", "", "Pâtes de f...
## $ quantity <chr> "265 g", "375g", "1...
## $ packaging <chr> "Bocal,Verre", "Pla...
## $ packaging_tags <chr> "bocal,verre", "pla...
## $ brands <chr> "Casino Délices", "...
## $ brands_tags <chr> "casino-delices", "...
## $ categories <chr> "Aliments et boisso...
## $ categories_tags <chr> "en:plant-based-foo...
## $ categories_en <chr> "Plant-based foods ...
## $ origins <chr> "", "", "", "", "Ar...
## $ origins_tags <chr> "", "", "", "", "ar...
## $ manufacturing_places <chr> "France", "Belgium"...
## $ manufacturing_places_tags <chr> "france", "belgium"...
## $ labels <chr> "", "", "", "Vegeta...
## $ labels_tags <chr> "", "", "", "en:veg...
## $ labels_en <chr> "", "", "", "Vegeta...
## $ emb_codes <chr> "EMB 78015", "", ""...
## $ emb_codes_tags <chr> "emb-78015", "", ""...
## $ first_packaging_code_geo <chr> "48.983333,2.066667...
## $ cities <chr> "", "", "", "", "",...
## $ cities_tags <chr> "andresy-yvelines-f...
## $ purchase_places <chr> "Lyon,France", "NSW...
## $ stores <chr> "Casino", "", "", "...
## $ countries <chr> "France", "Australi...
## $ countries_tags <chr> "en:france", "en:au...
## $ countries_en <chr> "France", "Australi...
## $ ingredients_text <chr> "Sucre de canne, fr...
## $ allergens <chr> "", "", "", "", "",...
## $ allergens_en <chr> "", "", "", "", "",...
## $ traces <chr> "Lait,Fruits à coqu...
## $ traces_tags <chr> "en:milk,en:nuts", ...
## $ traces_en <chr> "Milk,Nuts", "", ""...
## $ serving_size <chr> "15 g", "", "", "",...
## $ no_nutriments <lgl> NA, NA, NA, NA, NA,...
## $ additives_n <int> 1, NA, 2, 5, 0, NA,...
## $ additives <chr> "[ sucre-de-canne -...
## $ additives_tags <chr> "en:e440", "", "en:...
## $ additives_en <chr> "E440 - Pectins", "...
## $ ingredients_from_palm_oil_n <int> 0, NA, 0, 0, 0, NA,...
## $ ingredients_from_palm_oil <lgl> NA, NA, NA, NA, NA,...
## $ ingredients_from_palm_oil_tags <chr> "", "", "", "", "",...
## $ ingredients_that_may_be_from_palm_oil_n <int> 0, NA, 0, 1, 0, NA,...
## $ ingredients_that_may_be_from_palm_oil <lgl> NA, NA, NA, NA, NA,...
## $ ingredients_that_may_be_from_palm_oil_tags <chr> "", "", "", "e471-m...
## $ nutrition_grade_uk <lgl> NA, NA, NA, NA, NA,...
## $ nutrition_grade_fr <chr> "d", "", "", "d", "...
## $ pnns_groups_1 <chr> "Sugary snacks", "S...
## $ pnns_groups_2 <chr> "Sweets", "Chocolat...
## $ states <chr> "en:to-be-checked, ...
## $ states_tags <chr> "en:to-be-checked,e...
## $ states_en <chr> "To be checked,Comp...
## $ main_category <chr> "en:plant-based-foo...
## $ main_category_en <chr> "Plant-based foods ...
## $ image_url <chr> "http://en.openfood...
## $ image_small_url <chr> "http://en.openfood...
## $ energy_100g <dbl> 918, NA, NA, 766, 2...
## $ energy_from_fat_100g <dbl> NA, NA, NA, NA, NA,...
## $ fat_100g <dbl> 0.00, NA, NA, 16.70...
## $ saturated_fat_100g <dbl> 0.000, NA, NA, 9.90...
## $ butyric_acid_100g <lgl> NA, NA, NA, NA, NA,...
## $ caproic_acid_100g <lgl> NA, NA, NA, NA, NA,...
## $ caprylic_acid_100g <lgl> NA, NA, NA, NA, NA,...
## $ capric_acid_100g <lgl> NA, NA, NA, NA, NA,...
## $ lauric_acid_100g <lgl> NA, NA, NA, NA, NA,...
## $ myristic_acid_100g <lgl> NA, NA, NA, NA, NA,...
## $ palmitic_acid_100g <lgl> NA, NA, NA, NA, NA,...
## $ stearic_acid_100g <lgl> NA, NA, NA, NA, NA,...
## $ arachidic_acid_100g <lgl> NA, NA, NA, NA, NA,...
## $ behenic_acid_100g <lgl> NA, NA, NA, NA, NA,...
## $ lignoceric_acid_100g <lgl> NA, NA, NA, NA, NA,...
## $ cerotic_acid_100g <lgl> NA, NA, NA, NA, NA,...
## $ montanic_acid_100g <lgl> NA, NA, NA, NA, NA,...
## $ melissic_acid_100g <lgl> NA, NA, NA, NA, NA,...
## $ monounsaturated_fat_100g <dbl> NA, NA, NA, 2.9, 9....
## $ polyunsaturated_fat_100g <dbl> NA, NA, NA, 3.9, 32...
## $ omega_3_fat_100g <dbl> NA, NA, NA, NA, NA,...
## $ alpha_linolenic_acid_100g <dbl> NA, NA, NA, NA, NA,...
## $ eicosapentaenoic_acid_100g <dbl> NA, NA, NA, NA, NA,...
## $ docosahexaenoic_acid_100g <dbl> NA, NA, NA, NA, NA,...
## $ omega_6_fat_100g <dbl> NA, NA, NA, NA, NA,...
## $ linoleic_acid_100g <dbl> NA, NA, NA, NA, NA,...
## $ arachidonic_acid_100g <lgl> NA, NA, NA, NA, NA,...
## $ gamma_linolenic_acid_100g <lgl> NA, NA, NA, NA, NA,...
## $ dihomo_gamma_linolenic_acid_100g <lgl> NA, NA, NA, NA, NA,...
## $ omega_9_fat_100g <lgl> NA, NA, NA, NA, NA,...
## $ oleic_acid_100g <lgl> NA, NA, NA, NA, NA,...
## $ elaidic_acid_100g <lgl> NA, NA, NA, NA, NA,...
## $ gondoic_acid_100g <lgl> NA, NA, NA, NA, NA,...
## $ mead_acid_100g <lgl> NA, NA, NA, NA, NA,...
## $ erucic_acid_100g <lgl> NA, NA, NA, NA, NA,...
## $ nervonic_acid_100g <lgl> NA, NA, NA, NA, NA,...
## $ trans_fat_100g <dbl> NA, NA, NA, NA, NA,...
## $ cholesterol_100g <dbl> NA, NA, NA, 0.00020...
## $ carbohydrates_100g <dbl> 54.00, NA, NA, 5.70...
## $ sugars_100g <dbl> 54.00, NA, NA, 4.20...
## $ sucrose_100g <lgl> NA, NA, NA, NA, NA,...
## $ glucose_100g <lgl> NA, NA, NA, NA, NA,...
## $ fructose_100g <int> NA, NA, NA, NA, NA,...
## $ lactose_100g <dbl> NA, NA, NA, NA, NA,...
## $ maltose_100g <lgl> NA, NA, NA, NA, NA,...
## $ maltodextrins_100g <lgl> NA, NA, NA, NA, NA,...
## $ starch_100g <dbl> NA, NA, NA, NA, NA,...
## $ polyols_100g <dbl> NA, NA, NA, NA, NA,...
## $ fiber_100g <dbl> NA, NA, NA, 0.2, 9....
## $ proteins_100g <dbl> 0.00, NA, NA, 2.90,...
## $ casein_100g <dbl> NA, NA, NA, NA, NA,...
## $ serum_proteins_100g <lgl> NA, NA, NA, NA, NA,...
## $ nucleotides_100g <lgl> NA, NA, NA, NA, NA,...
## $ salt_100g <dbl> 0.0000000, NA, NA, ...
## $ sodium_100g <dbl> 0.0000000, NA, NA, ...
## $ alcohol_100g <dbl> NA, NA, NA, NA, NA,...
## $ vitamin_a_100g <dbl> NA, NA, NA, NA, NA,...
## $ beta_carotene_100g <lgl> NA, NA, NA, NA, NA,...
## $ vitamin_d_100g <dbl> NA, NA, NA, NA, NA,...
## $ vitamin_e_100g <dbl> NA, NA, NA, NA, NA,...
## $ vitamin_k_100g <dbl> NA, NA, NA, NA, NA,...
## $ vitamin_c_100g <dbl> NA, NA, NA, NA, NA,...
## $ vitamin_b1_100g <dbl> NA, NA, NA, NA, NA,...
## $ vitamin_b2_100g <dbl> NA, NA, NA, NA, NA,...
## $ vitamin_pp_100g <dbl> NA, NA, NA, NA, NA,...
## $ vitamin_b6_100g <dbl> NA, NA, NA, NA, NA,...
## $ vitamin_b9_100g <dbl> NA, NA, NA, NA, NA,...
## $ vitamin_b12_100g <dbl> NA, NA, NA, NA, NA,...
## $ biotin_100g <dbl> NA, NA, NA, NA, NA,...
## $ pantothenic_acid_100g <dbl> NA, NA, NA, NA, NA,...
## $ silica_100g <dbl> NA, NA, NA, NA, NA,...
## $ bicarbonate_100g <dbl> NA, NA, NA, NA, NA,...
## $ potassium_100g <dbl> NA, NA, NA, NA, NA,...
## $ chloride_100g <dbl> NA, NA, NA, NA, NA,...
## $ calcium_100g <dbl> NA, NA, NA, NA, NA,...
## $ phosphorus_100g <dbl> NA, NA, NA, NA, 1.1...
## $ iron_100g <dbl> NA, NA, NA, NA, 0.0...
## $ magnesium_100g <dbl> NA, NA, NA, NA, 0.1...
## $ zinc_100g <dbl> NA, NA, NA, NA, NA,...
## $ copper_100g <dbl> NA, NA, NA, NA, NA,...
## $ manganese_100g <dbl> NA, NA, NA, NA, NA,...
## $ fluoride_100g <dbl> NA, NA, NA, NA, NA,...
## $ selenium_100g <dbl> NA, NA, NA, NA, NA,...
## $ chromium_100g <lgl> NA, NA, NA, NA, NA,...
## $ molybdenum_100g <lgl> NA, NA, NA, NA, NA,...
## $ iodine_100g <dbl> NA, NA, NA, NA, NA,...
## $ caffeine_100g <lgl> NA, NA, NA, NA, NA,...
## $ taurine_100g <lgl> NA, NA, NA, NA, NA,...
## $ ph_100g <lgl> NA, NA, NA, NA, NA,...
## $ fruits_vegetables_nuts_100g <dbl> 54, NA, NA, NA, NA,...
## $ collagen_meat_protein_ratio_100g <int> NA, NA, NA, NA, NA,...
## $ cocoa_100g <int> NA, NA, NA, NA, NA,...
## $ chlorophyl_100g <lgl> NA, NA, NA, NA, NA,...
## $ carbon_footprint_100g <dbl> NA, NA, NA, NA, NA,...
## $ nutrition_score_fr_100g <int> 11, NA, NA, 11, 17,...
## $ nutrition_score_uk_100g <int> 11, NA, NA, 11, 17,...
# View column names of food
names(food)
## [1] "V1"
## [2] "code"
## [3] "url"
## [4] "creator"
## [5] "created_t"
## [6] "created_datetime"
## [7] "last_modified_t"
## [8] "last_modified_datetime"
## [9] "product_name"
## [10] "generic_name"
## [11] "quantity"
## [12] "packaging"
## [13] "packaging_tags"
## [14] "brands"
## [15] "brands_tags"
## [16] "categories"
## [17] "categories_tags"
## [18] "categories_en"
## [19] "origins"
## [20] "origins_tags"
## [21] "manufacturing_places"
## [22] "manufacturing_places_tags"
## [23] "labels"
## [24] "labels_tags"
## [25] "labels_en"
## [26] "emb_codes"
## [27] "emb_codes_tags"
## [28] "first_packaging_code_geo"
## [29] "cities"
## [30] "cities_tags"
## [31] "purchase_places"
## [32] "stores"
## [33] "countries"
## [34] "countries_tags"
## [35] "countries_en"
## [36] "ingredients_text"
## [37] "allergens"
## [38] "allergens_en"
## [39] "traces"
## [40] "traces_tags"
## [41] "traces_en"
## [42] "serving_size"
## [43] "no_nutriments"
## [44] "additives_n"
## [45] "additives"
## [46] "additives_tags"
## [47] "additives_en"
## [48] "ingredients_from_palm_oil_n"
## [49] "ingredients_from_palm_oil"
## [50] "ingredients_from_palm_oil_tags"
## [51] "ingredients_that_may_be_from_palm_oil_n"
## [52] "ingredients_that_may_be_from_palm_oil"
## [53] "ingredients_that_may_be_from_palm_oil_tags"
## [54] "nutrition_grade_uk"
## [55] "nutrition_grade_fr"
## [56] "pnns_groups_1"
## [57] "pnns_groups_2"
## [58] "states"
## [59] "states_tags"
## [60] "states_en"
## [61] "main_category"
## [62] "main_category_en"
## [63] "image_url"
## [64] "image_small_url"
## [65] "energy_100g"
## [66] "energy_from_fat_100g"
## [67] "fat_100g"
## [68] "saturated_fat_100g"
## [69] "butyric_acid_100g"
## [70] "caproic_acid_100g"
## [71] "caprylic_acid_100g"
## [72] "capric_acid_100g"
## [73] "lauric_acid_100g"
## [74] "myristic_acid_100g"
## [75] "palmitic_acid_100g"
## [76] "stearic_acid_100g"
## [77] "arachidic_acid_100g"
## [78] "behenic_acid_100g"
## [79] "lignoceric_acid_100g"
## [80] "cerotic_acid_100g"
## [81] "montanic_acid_100g"
## [82] "melissic_acid_100g"
## [83] "monounsaturated_fat_100g"
## [84] "polyunsaturated_fat_100g"
## [85] "omega_3_fat_100g"
## [86] "alpha_linolenic_acid_100g"
## [87] "eicosapentaenoic_acid_100g"
## [88] "docosahexaenoic_acid_100g"
## [89] "omega_6_fat_100g"
## [90] "linoleic_acid_100g"
## [91] "arachidonic_acid_100g"
## [92] "gamma_linolenic_acid_100g"
## [93] "dihomo_gamma_linolenic_acid_100g"
## [94] "omega_9_fat_100g"
## [95] "oleic_acid_100g"
## [96] "elaidic_acid_100g"
## [97] "gondoic_acid_100g"
## [98] "mead_acid_100g"
## [99] "erucic_acid_100g"
## [100] "nervonic_acid_100g"
## [101] "trans_fat_100g"
## [102] "cholesterol_100g"
## [103] "carbohydrates_100g"
## [104] "sugars_100g"
## [105] "sucrose_100g"
## [106] "glucose_100g"
## [107] "fructose_100g"
## [108] "lactose_100g"
## [109] "maltose_100g"
## [110] "maltodextrins_100g"
## [111] "starch_100g"
## [112] "polyols_100g"
## [113] "fiber_100g"
## [114] "proteins_100g"
## [115] "casein_100g"
## [116] "serum_proteins_100g"
## [117] "nucleotides_100g"
## [118] "salt_100g"
## [119] "sodium_100g"
## [120] "alcohol_100g"
## [121] "vitamin_a_100g"
## [122] "beta_carotene_100g"
## [123] "vitamin_d_100g"
## [124] "vitamin_e_100g"
## [125] "vitamin_k_100g"
## [126] "vitamin_c_100g"
## [127] "vitamin_b1_100g"
## [128] "vitamin_b2_100g"
## [129] "vitamin_pp_100g"
## [130] "vitamin_b6_100g"
## [131] "vitamin_b9_100g"
## [132] "vitamin_b12_100g"
## [133] "biotin_100g"
## [134] "pantothenic_acid_100g"
## [135] "silica_100g"
## [136] "bicarbonate_100g"
## [137] "potassium_100g"
## [138] "chloride_100g"
## [139] "calcium_100g"
## [140] "phosphorus_100g"
## [141] "iron_100g"
## [142] "magnesium_100g"
## [143] "zinc_100g"
## [144] "copper_100g"
## [145] "manganese_100g"
## [146] "fluoride_100g"
## [147] "selenium_100g"
## [148] "chromium_100g"
## [149] "molybdenum_100g"
## [150] "iodine_100g"
## [151] "caffeine_100g"
## [152] "taurine_100g"
## [153] "ph_100g"
## [154] "fruits_vegetables_nuts_100g"
## [155] "collagen_meat_protein_ratio_100g"
## [156] "cocoa_100g"
## [157] "chlorophyl_100g"
## [158] "carbon_footprint_100g"
## [159] "nutrition_score_fr_100g"
## [160] "nutrition_score_uk_100g"
# View summary of food
summary(food[,1:10])
## V1 code url creator
## Length:1500 Min. :0 Length:1500 Length:1500
## Class :character 1st Qu.:0 Class :character Class :character
## Mode :character Median :0 Mode :character Mode :character
## Mean :0
## 3rd Qu.:0
## Max. :0
## created_t created_datetime last_modified_t
## Length:1500 Length:1500 Length:1500
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## last_modified_datetime product_name generic_name
## Length:1500 Length:1500 Length:1500
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
# View head of food
head(food[,1:10])
## V1 code
## 1 1 1.592115e-311
## 2 2 2.673378e-311
## 3 3 1.613001e-311
## 4 4 3.955509e-311
## 5 5 4.189677e-311
## 6 6 4.333113e-313
## url
## 1 http://world-en.openfoodfacts.org/product/3222475745867/confiture-de-fraise-fraise-des-bois-au-sucre-de-canne-casino-delices
## 2 http://world-en.openfoodfacts.org/product/5410976880110/guylian-sea-shells-selection
## 3 http://world-en.openfoodfacts.org/product/3264750423503/pates-de-fruits-aromatisees-jacquot
## 4 http://world-en.openfoodfacts.org/product/8006040247001/nata-vegetal-a-base-de-soja-valsoia
## 5 http://world-en.openfoodfacts.org/product/8480000340764/semillas-de-girasol-con-cascara-tostadas-aguasal-hacendado
## 6 http://world-en.openfoodfacts.org/product/0087703177727/soft-drink
## creator created_t created_datetime last_modified_t
## 1 sebleouf 1424747544 2015-02-24T03:12:24Z 1438445887
## 2 foodorigins 1450316429 2015-12-17T01:40:29Z 1450817956
## 3 domdom26 1428674916 2015-04-10T14:08:36Z 1428739289
## 4 javichu 1420416591 2015-01-05T00:09:51Z 1420417876
## 5 javichu 1420501121 2015-01-05T23:38:41Z 1445700917
## 6 foodorigins 1437983923 2015-07-27T07:58:43Z 1445577476
## last_modified_datetime
## 1 2015-08-01T16:18:07Z
## 2 2015-12-22T20:59:16Z
## 3 2015-04-11T08:01:29Z
## 4 2015-01-05T00:31:16Z
## 5 2015-10-24T15:35:17Z
## 6 2015-10-23T05:17:56Z
## product_name
## 1 Confiture de fraise fraise des bois au sucre de canne
## 2 Guylian Sea Shells Selection
## 3 Pâtes de fruits aromatisées
## 4 Nata vegetal a base de soja "Valsoia"
## 5 Semillas de girasol con cáscara tostadas aguasal
## 6 Soft Drink
## generic_name
## 1
## 2
## 3 Pâtes de fruits
## 4 Nata vegetal a base de soja
## 5 Semillas de girasol con cáscara tostadas aguasal
## 6
# View structure of food
str(food[,1:10])
## 'data.frame': 1500 obs. of 10 variables:
## $ V1 : chr "1" "2" "3" "4" ...
## $ code : num 1.59e-311 2.67e-311 1.61e-311 3.96e-311 4.19e-311 ...
## $ url : chr "http://world-en.openfoodfacts.org/product/3222475745867/confiture-de-fraise-fraise-des-bois-au-sucre-de-canne-casino-delices" "http://world-en.openfoodfacts.org/product/5410976880110/guylian-sea-shells-selection" "http://world-en.openfoodfacts.org/product/3264750423503/pates-de-fruits-aromatisees-jacquot" "http://world-en.openfoodfacts.org/product/8006040247001/nata-vegetal-a-base-de-soja-valsoia" ...
## $ creator : chr "sebleouf" "foodorigins" "domdom26" "javichu" ...
## $ created_t : chr "1424747544" "1450316429" "1428674916" "1420416591" ...
## $ created_datetime : chr "2015-02-24T03:12:24Z" "2015-12-17T01:40:29Z" "2015-04-10T14:08:36Z" "2015-01-05T00:09:51Z" ...
## $ last_modified_t : chr "1438445887" "1450817956" "1428739289" "1420417876" ...
## $ last_modified_datetime: chr "2015-08-01T16:18:07Z" "2015-12-22T20:59:16Z" "2015-04-11T08:01:29Z" "2015-01-05T00:31:16Z" ...
## $ product_name : chr "Confiture de fraise fraise des bois au sucre de canne" "Guylian Sea Shells Selection" "Pâtes de fruits aromatisées" "Nata vegetal a base de soja "Valsoia"" ...
## $ generic_name : chr "" "" "Pâtes de fruits" "Nata vegetal a base de soja" ...
plasti
# Define vector of duplicate cols (don't change)
duplicates <- c(4, 6, 11, 13, 15, 17, 18, 20, 22,
24, 25, 28, 32, 34, 36, 38, 40,
44, 46, 48, 51, 54, 65, 158)
# Remove duplicates from food: food2
food2 <- food[,-duplicates]
# Define useless vector (don't change)
useless <- c(1, 2, 3, 32:41)
# Remove useless columns from food2: food3
food3 <- food2[,-useless]
# Create vector of column indices: nutrition
nutrition <- str_detect(names(food3),"100g")
# View a summary of nutrition columns
summary(food3[,nutrition])
## energy_from_fat_100g fat_100g saturated_fat_100g
## Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.: 35.98 1st Qu.: 0.90 1st Qu.: 0.200
## Median : 237.00 Median : 6.00 Median : 1.700
## Mean : 668.41 Mean : 13.39 Mean : 4.874
## 3rd Qu.: 974.00 3rd Qu.: 20.00 3rd Qu.: 6.500
## Max. :2900.00 Max. :100.00 Max. :57.000
## NA's :1486 NA's :708 NA's :797
## butyric_acid_100g caproic_acid_100g caprylic_acid_100g capric_acid_100g
## Mode:logical Mode:logical Mode:logical Mode:logical
## NA's:1500 NA's:1500 NA's:1500 NA's:1500
##
##
##
##
##
## lauric_acid_100g myristic_acid_100g palmitic_acid_100g stearic_acid_100g
## Mode:logical Mode:logical Mode:logical Mode:logical
## NA's:1500 NA's:1500 NA's:1500 NA's:1500
##
##
##
##
##
## arachidic_acid_100g behenic_acid_100g lignoceric_acid_100g
## Mode:logical Mode:logical Mode:logical
## NA's:1500 NA's:1500 NA's:1500
##
##
##
##
##
## cerotic_acid_100g montanic_acid_100g melissic_acid_100g
## Mode:logical Mode:logical Mode:logical
## NA's:1500 NA's:1500 NA's:1500
##
##
##
##
##
## monounsaturated_fat_100g polyunsaturated_fat_100g omega_3_fat_100g
## Min. : 0.00 Min. : 0.400 Min. : 0.033
## 1st Qu.: 3.87 1st Qu.: 1.653 1st Qu.: 1.300
## Median : 9.50 Median : 3.900 Median : 3.000
## Mean :19.77 Mean : 9.986 Mean : 3.726
## 3rd Qu.:29.00 3rd Qu.:12.700 3rd Qu.: 3.200
## Max. :75.00 Max. :46.200 Max. :12.400
## NA's :1465 NA's :1464 NA's :1491
## alpha_linolenic_acid_100g eicosapentaenoic_acid_100g
## Min. :0.0800 Min. :0.721
## 1st Qu.:0.0905 1st Qu.:0.721
## Median :0.1010 Median :0.721
## Mean :0.1737 Mean :0.721
## 3rd Qu.:0.2205 3rd Qu.:0.721
## Max. :0.3400 Max. :0.721
## NA's :1497 NA's :1499
## docosahexaenoic_acid_100g omega_6_fat_100g linoleic_acid_100g
## Min. :1.09 Min. :0.25 Min. :0.5000
## 1st Qu.:1.09 1st Qu.:0.25 1st Qu.:0.5165
## Median :1.09 Median :0.25 Median :0.5330
## Mean :1.09 Mean :0.25 Mean :0.5330
## 3rd Qu.:1.09 3rd Qu.:0.25 3rd Qu.:0.5495
## Max. :1.09 Max. :0.25 Max. :0.5660
## NA's :1499 NA's :1499 NA's :1498
## arachidonic_acid_100g gamma_linolenic_acid_100g
## Mode:logical Mode:logical
## NA's:1500 NA's:1500
##
##
##
##
##
## dihomo_gamma_linolenic_acid_100g omega_9_fat_100g oleic_acid_100g
## Mode:logical Mode:logical Mode:logical
## NA's:1500 NA's:1500 NA's:1500
##
##
##
##
##
## elaidic_acid_100g gondoic_acid_100g mead_acid_100g erucic_acid_100g
## Mode:logical Mode:logical Mode:logical Mode:logical
## NA's:1500 NA's:1500 NA's:1500 NA's:1500
##
##
##
##
##
## nervonic_acid_100g trans_fat_100g cholesterol_100g carbohydrates_100g
## Mode:logical Min. :0.0000 Min. :0.0000 Min. : 0.000
## NA's:1500 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 3.792
## Median :0.0000 Median :0.0000 Median : 13.500
## Mean :0.0105 Mean :0.0265 Mean : 27.958
## 3rd Qu.:0.0000 3rd Qu.:0.0026 3rd Qu.: 55.000
## Max. :0.1000 Max. :0.4300 Max. :100.000
## NA's :1481 NA's :1477 NA's :708
## sugars_100g sucrose_100g glucose_100g fructose_100g
## Min. : 0.00 Mode:logical Mode:logical Min. :100
## 1st Qu.: 1.00 NA's:1500 NA's:1500 1st Qu.:100
## Median : 4.05 Median :100
## Mean : 12.66 Mean :100
## 3rd Qu.: 14.70 3rd Qu.:100
## Max. :100.00 Max. :100
## NA's :788 NA's :1499
## lactose_100g maltose_100g maltodextrins_100g starch_100g
## Min. :0.000 Mode:logical Mode:logical Min. : 0.00
## 1st Qu.:0.250 NA's:1500 NA's:1500 1st Qu.: 9.45
## Median :0.500 Median :39.50
## Mean :2.933 Mean :30.73
## 3rd Qu.:4.400 3rd Qu.:42.85
## Max. :8.300 Max. :71.00
## NA's :1497 NA's :1493
## polyols_100g fiber_100g proteins_100g casein_100g
## Min. : 8.60 Min. : 0.000 Min. : 0.000 Min. :1.1
## 1st Qu.:59.10 1st Qu.: 0.500 1st Qu.: 1.500 1st Qu.:1.1
## Median :67.00 Median : 1.750 Median : 6.000 Median :1.1
## Mean :56.06 Mean : 2.823 Mean : 7.563 Mean :1.1
## 3rd Qu.:69.80 3rd Qu.: 3.500 3rd Qu.:10.675 3rd Qu.:1.1
## Max. :70.00 Max. :46.700 Max. :61.000 Max. :1.1
## NA's :1491 NA's :994 NA's :710 NA's :1499
## serum_proteins_100g nucleotides_100g salt_100g sodium_100g
## Mode:logical Mode:logical Min. : 0.0000 Min. : 0.0000
## NA's:1500 NA's:1500 1st Qu.: 0.0438 1st Qu.: 0.0172
## Median : 0.4498 Median : 0.1771
## Mean : 1.1205 Mean : 0.4409
## 3rd Qu.: 1.1938 3rd Qu.: 0.4700
## Max. :102.0000 Max. :40.0000
## NA's :780 NA's :780
## alcohol_100g vitamin_a_100g beta_carotene_100g vitamin_d_100g
## Min. : 0.00 Min. :0.0000 Mode:logical Min. :0e+00
## 1st Qu.: 0.00 1st Qu.:0.0000 NA's:1500 1st Qu.:0e+00
## Median : 5.50 Median :0.0001 Median :0e+00
## Mean :10.07 Mean :0.0003 Mean :0e+00
## 3rd Qu.:13.00 3rd Qu.:0.0006 3rd Qu.:0e+00
## Max. :50.00 Max. :0.0013 Max. :1e-04
## NA's :1433 NA's :1477 NA's :1485
## vitamin_e_100g vitamin_k_100g vitamin_c_100g vitamin_b1_100g
## Min. :0.0005 Min. :0 Min. :0.000 Min. :0.0001
## 1st Qu.:0.0021 1st Qu.:0 1st Qu.:0.002 1st Qu.:0.0003
## Median :0.0044 Median :0 Median :0.019 Median :0.0004
## Mean :0.0069 Mean :0 Mean :0.025 Mean :0.0006
## 3rd Qu.:0.0097 3rd Qu.:0 3rd Qu.:0.030 3rd Qu.:0.0010
## Max. :0.0320 Max. :0 Max. :0.217 Max. :0.0013
## NA's :1478 NA's :1498 NA's :1459 NA's :1478
## vitamin_b2_100g vitamin_pp_100g vitamin_b6_100g vitamin_b9_100g
## Min. :0.0002 Min. :0.0006 Min. :0.0001 Min. :0e+00
## 1st Qu.:0.0003 1st Qu.:0.0033 1st Qu.:0.0002 1st Qu.:0e+00
## Median :0.0009 Median :0.0069 Median :0.0008 Median :1e-04
## Mean :0.0011 Mean :0.0086 Mean :0.0112 Mean :1e-04
## 3rd Qu.:0.0013 3rd Qu.:0.0140 3rd Qu.:0.0012 3rd Qu.:2e-04
## Max. :0.0066 Max. :0.0160 Max. :0.2000 Max. :2e-04
## NA's :1483 NA's :1484 NA's :1481 NA's :1483
## vitamin_b12_100g biotin_100g pantothenic_acid_100g silica_100g
## Min. :0 Min. :0 Min. :0.0000 Min. :8e-04
## 1st Qu.:0 1st Qu.:0 1st Qu.:0.0007 1st Qu.:8e-04
## Median :0 Median :0 Median :0.0020 Median :8e-04
## Mean :0 Mean :0 Mean :0.0027 Mean :8e-04
## 3rd Qu.:0 3rd Qu.:0 3rd Qu.:0.0051 3rd Qu.:8e-04
## Max. :0 Max. :0 Max. :0.0060 Max. :8e-04
## NA's :1489 NA's :1498 NA's :1486 NA's :1499
## bicarbonate_100g potassium_100g chloride_100g calcium_100g
## Min. :0.0006 Min. :0.0000 Min. :0.0003 Min. :0.0000
## 1st Qu.:0.0678 1st Qu.:0.0650 1st Qu.:0.0006 1st Qu.:0.0450
## Median :0.1350 Median :0.1940 Median :0.0009 Median :0.1200
## Mean :0.1692 Mean :0.3288 Mean :0.0144 Mean :0.2040
## 3rd Qu.:0.2535 3rd Qu.:0.3670 3rd Qu.:0.0214 3rd Qu.:0.1985
## Max. :0.3720 Max. :1.4300 Max. :0.0420 Max. :1.0000
## NA's :1497 NA's :1487 NA's :1497 NA's :1449
## phosphorus_100g iron_100g magnesium_100g zinc_100g
## Min. :0.0430 Min. :0.0000 Min. :0.0000 Min. :0.0005
## 1st Qu.:0.1938 1st Qu.:0.0012 1st Qu.:0.0670 1st Qu.:0.0009
## Median :0.3185 Median :0.0042 Median :0.1040 Median :0.0017
## Mean :0.3777 Mean :0.0045 Mean :0.1066 Mean :0.0016
## 3rd Qu.:0.4340 3rd Qu.:0.0077 3rd Qu.:0.1300 3rd Qu.:0.0022
## Max. :1.1550 Max. :0.0137 Max. :0.3330 Max. :0.0026
## NA's :1488 NA's :1463 NA's :1479 NA's :1493
## copper_100g manganese_100g fluoride_100g selenium_100g
## Min. :0e+00 Min. :0 Min. :0 Min. :0
## 1st Qu.:1e-04 1st Qu.:0 1st Qu.:0 1st Qu.:0
## Median :1e-04 Median :0 Median :0 Median :0
## Mean :1e-04 Mean :0 Mean :0 Mean :0
## 3rd Qu.:1e-04 3rd Qu.:0 3rd Qu.:0 3rd Qu.:0
## Max. :1e-04 Max. :0 Max. :0 Max. :0
## NA's :1498 NA's :1499 NA's :1498 NA's :1499
## chromium_100g molybdenum_100g iodine_100g caffeine_100g
## Mode:logical Mode:logical Min. :0 Mode:logical
## NA's:1500 NA's:1500 1st Qu.:0 NA's:1500
## Median :0
## Mean :0
## 3rd Qu.:0
## Max. :0
## NA's :1499
## taurine_100g ph_100g fruits_vegetables_nuts_100g
## Mode:logical Mode:logical Min. : 2.00
## NA's:1500 NA's:1500 1st Qu.:11.25
## Median :42.00
## Mean :36.88
## 3rd Qu.:52.25
## Max. :80.00
## NA's :1470
## collagen_meat_protein_ratio_100g cocoa_100g chlorophyl_100g
## Min. :12.00 Min. :30 Mode:logical
## 1st Qu.:13.50 1st Qu.:47 NA's:1500
## Median :15.00 Median :60
## Mean :15.67 Mean :57
## 3rd Qu.:17.50 3rd Qu.:70
## Max. :20.00 Max. :81
## NA's :1497 NA's :1491
## nutrition_score_fr_100g nutrition_score_uk_100g
## Min. :-12.000 Min. :-12.000
## 1st Qu.: 1.000 1st Qu.: 0.000
## Median : 7.000 Median : 6.000
## Mean : 7.941 Mean : 7.631
## 3rd Qu.: 15.000 3rd Qu.: 16.000
## Max. : 28.000 Max. : 28.000
## NA's :825 NA's :825
# Find indices of sugar NA values: missing
missing <- is.na(food3$sugars_100g)
# Replace NA values with 0
food3$sugars_100g[missing] <- 0
# Create first histogram
hist(food3$sugars_100g, breaks=100)
# Create food4
food4 <- food3[food3$sugars_100g != 0, ]
# Create second histogram
hist(food4$sugars_100g, breaks=100)
packagin_tags
column was removed in an earlier exercise. that maybe a mistake# Find entries containing "plasti": plastic
plastic <- str_detect(food$packaging_tags,"plasti")
# Print the sum of plastic
sum(plastic)
## [1] 540
# Load the gdata package
library(gdata)
# Import the spreadsheet: att
url_att <- 'http://s3.amazonaws.com/assets.datacamp.com/production/course_1294/datasets/attendance.xls'
att <- read.xls(url_att)
# Print the column names
names(att)
## [1] "Table.43..Average.daily.attendance..ADA..as.a.percentage.of.total.enrollment..school.day.length..and.school.year.length.in.public.schools..by.school.level.and.state..2007.08"
## [2] "X"
## [3] "X.1"
## [4] "X.2"
## [5] "X.3"
## [6] "X.4"
## [7] "X.5"
## [8] "X.6"
## [9] "X.7"
## [10] "X.8"
## [11] "X.9"
## [12] "X.10"
## [13] "X.11"
## [14] "X.12"
## [15] "X.13"
## [16] "X.14"
## [17] "X.15"
# Print the first 6 rows
head(att)
## Table.43..Average.daily.attendance..ADA..as.a.percentage.of.total.enrollment..school.day.length..and.school.year.length.in.public.schools..by.school.level.and.state..2007.08
## 1
## 2
## 3 1
## 4 United States ........
## 5 Alabama .................
## 6 Alaska ..................
## X
## 1 Total elementary, secondary, and combined elementary/secondary schools
## 2 ADA as percent of enrollment
## 3 2
## 4 93.1
## 5 93.8
## 6 89.9
## X.1 X.2 X.3 X.4
## 1
## 2 Average hours in school day Average days in school year
## 3 3 4
## 4 (0.22) 6.6 (0.02) 180
## 5 (1.24) 7.0 (0.07) 180
## 6 (1.22) 6.5 (0.05) 180
## X.5 X.6 X.7 X.8
## 1 Elementary schools
## 2 Average hours in school year ADA as percent of enrollment
## 3 5 6
## 4 (0.1) 1,193 (3.1) 94.0
## 5 (0.8) 1,267 (12.3) 93.8
## 6 (3.4) 1,163 (22.9) 91.3
## X.9 X.10 X.11 X.12
## 1 Secondary schools
## 2 Average hours in school day ADA as percent of enrollment
## 3 7 8
## 4 (0.27) 6.7 (0.02) 91.1
## 5 (1.84) 7.0 (0.08) 94.6
## 6 (1.56) 6.5 (0.05) 93.2
## X.13 X.14 X.15
## 1
## 2 Average hours in school day
## 3 9
## 4 (0.43) 6.6 (0.04)
## 5 (0.38) 7.1 (0.17)
## 6 (1.57) 6.2 (0.15)
# Print the last 6 rows
tail(att)
## Table.43..Average.daily.attendance..ADA..as.a.percentage.of.total.enrollment..school.day.length..and.school.year.length.in.public.schools..by.school.level.and.state..2007.08
## 54 Wisconsin ...............
## 55 Wyoming .................
## 56 †Not applicable.
## 57 ‡Reporting standards not met (too few cases).
## 58 NOTE: Averages reflect data reported by schools rather than state requirements. School-reported length of day may exceed state requirements, and there is a range of statistical error in reported estimates. Standard errors appear in parentheses.
## 59 SOURCE: U.S. Department of Education, National Center for Education Statistics, Schools and Staffing Survey (SASS), \\Public School Questionnaire
## X X.1 X.2
## 54 95.0 (0.57) 6.9
## 55 92.4 (1.15) 6.9
## 56
## 57
## 58
## 59 \\ 2003-04 and 2007-08. (This table was prepared June 2011.)
## X.3 X.4 X.5 X.6 X.7 X.8 X.9 X.10 X.11 X.12 X.13 X.14
## 54 (0.04) 180 (0.7) 1,246 (8.6) 95.4 (0.41) 6.9 (0.05) 93.0 (1.91) 7.0
## 55 (0.05) 175 (1.3) 1,201 (8.3) 92.2 (1.65) 6.9 (0.05) 92.4 (0.75) 7.0
## 56
## 57
## 58
## 59
## X.15
## 54 (0.14)
## 55 (0.07)
## 56
## 57
## 58
## 59
# Print the structure
str(att)
## 'data.frame': 59 obs. of 17 variables:
## $ Table.43..Average.daily.attendance..ADA..as.a.percentage.of.total.enrollment..school.day.length..and.school.year.length.in.public.schools..by.school.level.and.state..2007.08: Factor w/ 58 levels ""," United States ........",..: 1 1 5 2 6 7 8 9 10 11 ...
## $ X : Factor w/ 42 levels "","\\ 2003-04 and 2007-08. (This table was prepared June 2011.)",..: 42 41 3 22 28 8 6 14 23 29 ...
## $ X.1 : Factor w/ 45 levels "","(0.22)","(0.23)",..: 1 1 1 2 22 21 41 27 14 6 ...
## $ X.2 : Factor w/ 14 levels "","3","6.2","6.3",..: 1 14 2 7 11 6 5 10 3 11 ...
## $ X.3 : Factor w/ 14 levels "","(0.02)","(0.03)",..: 1 1 1 2 7 5 9 6 7 5 ...
## $ X.4 : Factor w/ 15 levels "","171","172",..: 1 15 14 10 10 10 11 9 11 2 ...
## $ X.5 : Factor w/ 22 levels "","(0.0)","(0.1)",..: 1 1 1 3 10 21 19 4 6 12 ...
## $ X.6 : Factor w/ 48 levels "","1,102","1,117",..: 1 48 47 26 45 15 13 36 5 28 ...
## $ X.7 : Factor w/ 48 levels "","(10.1)","(10.3)",..: 1 1 1 36 11 35 22 6 13 48 ...
## $ X.8 : Factor w/ 40 levels "","6","81.0",..: 40 39 2 22 20 9 5 13 27 25 ...
## $ X.9 : Factor w/ 51 levels "","(0.24)","(0.25)",..: 1 1 1 4 32 25 48 35 20 13 ...
## $ X.10 : Factor w/ 14 levels "","6.2","6.3",..: 1 14 10 7 11 5 4 9 3 11 ...
## $ X.11 : Factor w/ 13 levels "","(0.02)","(0.03)",..: 1 1 1 2 8 5 10 8 5 7 ...
## $ X.12 : Factor w/ 41 levels "","‡","8","85.8",..: 41 40 3 19 35 30 8 18 9 20 ...
## $ X.13 : Factor w/ 47 levels "","(†)","(0.35)",..: 1 1 1 7 5 25 43 35 23 20 ...
## $ X.14 : Factor w/ 17 levels "","‡","5.9","6.1",..: 1 17 16 9 14 5 7 11 4 13 ...
## $ X.15 : Factor w/ 29 levels "","(†)","(0.03)",..: 1 1 1 4 15 13 21 8 18 9 ...
# Create remove
remove <- c(3,56:59)
# Create att2
att2 <- att[-remove,]
# Create remove
remove <- seq(3,17,2)
# Create att3
att3 <- att2[,-remove]
head(att3)
## Table.43..Average.daily.attendance..ADA..as.a.percentage.of.total.enrollment..school.day.length..and.school.year.length.in.public.schools..by.school.level.and.state..2007.08
## 1
## 2
## 4 United States ........
## 5 Alabama .................
## 6 Alaska ..................
## 7 Arizona .................
## X
## 1 Total elementary, secondary, and combined elementary/secondary schools
## 2 ADA as percent of enrollment
## 4 93.1
## 5 93.8
## 6 89.9
## 7 89.0
## X.2 X.4
## 1
## 2 Average hours in school day Average days in school year
## 4 6.6 180
## 5 7.0 180
## 6 6.5 180
## 7 6.4 181
## X.6 X.8
## 1 Elementary schools
## 2 Average hours in school year ADA as percent of enrollment
## 4 1,193 94.0
## 5 1,267 93.8
## 6 1,163 91.3
## 7 1,159 88.9
## X.10 X.12
## 1 Secondary schools
## 2 Average hours in school day ADA as percent of enrollment
## 4 6.7 91.1
## 5 7.0 94.6
## 6 6.5 93.2
## 7 6.4 89.0
## X.14
## 1
## 2 Average hours in school day
## 4 6.6
## 5 7.1
## 6 6.2
## 7 6.4
# Subset just elementary schools: att_elem
att_elem <- att3[,c(1,6,7)]
head(att_elem)
## Table.43..Average.daily.attendance..ADA..as.a.percentage.of.total.enrollment..school.day.length..and.school.year.length.in.public.schools..by.school.level.and.state..2007.08
## 1
## 2
## 4 United States ........
## 5 Alabama .................
## 6 Alaska ..................
## 7 Arizona .................
## X.8 X.10
## 1 Elementary schools
## 2 ADA as percent of enrollment Average hours in school day
## 4 94.0 6.7
## 5 93.8 7.0
## 6 91.3 6.5
## 7 88.9 6.4
# Subset just secondary schools: att_sec
att_sec <- att3[,c(1,8,9)]
head(att_sec)
## Table.43..Average.daily.attendance..ADA..as.a.percentage.of.total.enrollment..school.day.length..and.school.year.length.in.public.schools..by.school.level.and.state..2007.08
## 1
## 2
## 4 United States ........
## 5 Alabama .................
## 6 Alaska ..................
## 7 Arizona .................
## X.12 X.14
## 1 Secondary schools
## 2 ADA as percent of enrollment Average hours in school day
## 4 91.1 6.6
## 5 94.6 7.1
## 6 93.2 6.2
## 7 89.0 6.4
# Subset all schools: att4
att4 <- att3[,1:5]
head(att4)
## Table.43..Average.daily.attendance..ADA..as.a.percentage.of.total.enrollment..school.day.length..and.school.year.length.in.public.schools..by.school.level.and.state..2007.08
## 1
## 2
## 4 United States ........
## 5 Alabama .................
## 6 Alaska ..................
## 7 Arizona .................
## X
## 1 Total elementary, secondary, and combined elementary/secondary schools
## 2 ADA as percent of enrollment
## 4 93.1
## 5 93.8
## 6 89.9
## 7 89.0
## X.2 X.4
## 1
## 2 Average hours in school day Average days in school year
## 4 6.6 180
## 5 7.0 180
## 6 6.5 180
## 7 6.4 181
## X.6
## 1
## 2 Average hours in school year
## 4 1,193
## 5 1,267
## 6 1,163
## 7 1,159
# Define cnames vector (don't change)
cnames <- c("state", "avg_attend_pct", "avg_hr_per_day",
"avg_day_per_yr", "avg_hr_per_yr")
# Assign column names of att4
colnames(att4) <- cnames
head(att4)
## state
## 1
## 2
## 4 United States ........
## 5 Alabama .................
## 6 Alaska ..................
## 7 Arizona .................
## avg_attend_pct
## 1 Total elementary, secondary, and combined elementary/secondary schools
## 2 ADA as percent of enrollment
## 4 93.1
## 5 93.8
## 6 89.9
## 7 89.0
## avg_hr_per_day avg_day_per_yr
## 1
## 2 Average hours in school day Average days in school year
## 4 6.6 180
## 5 7.0 180
## 6 6.5 180
## 7 6.4 181
## avg_hr_per_yr
## 1
## 2 Average hours in school year
## 4 1,193
## 5 1,267
## 6 1,163
## 7 1,159
# Remove first two rows of att4: att5
att5 <- att4[-c(1,2),]
# View the names of att5
names(att5)
## [1] "state" "avg_attend_pct" "avg_hr_per_day" "avg_day_per_yr"
## [5] "avg_hr_per_yr"
head(att5)
## state avg_attend_pct avg_hr_per_day avg_day_per_yr
## 4 United States ........ 93.1 6.6 180
## 5 Alabama ................. 93.8 7.0 180
## 6 Alaska .................. 89.9 6.5 180
## 7 Arizona ................. 89.0 6.4 181
## 8 Arkansas ................ 91.8 6.9 179
## 9 California .............. 93.2 6.2 181
## avg_hr_per_yr
## 4 1,193
## 5 1,267
## 6 1,163
## 7 1,159
## 8 1,229
## 9 1,129
## stringr and att5 are pre-loaded
head(att5)
## state avg_attend_pct avg_hr_per_day avg_day_per_yr
## 4 United States ........ 93.1 6.6 180
## 5 Alabama ................. 93.8 7.0 180
## 6 Alaska .................. 89.9 6.5 180
## 7 Arizona ................. 89.0 6.4 181
## 8 Arkansas ................ 91.8 6.9 179
## 9 California .............. 93.2 6.2 181
## avg_hr_per_yr
## 4 1,193
## 5 1,267
## 6 1,163
## 7 1,159
## 8 1,229
## 9 1,129
# Remove all periods in state column
att5$state <- str_replace_all(att5$state, "\\.","")
head(att5)
## state avg_attend_pct avg_hr_per_day avg_day_per_yr
## 4 United States 93.1 6.6 180
## 5 Alabama 93.8 7.0 180
## 6 Alaska 89.9 6.5 180
## 7 Arizona 89.0 6.4 181
## 8 Arkansas 91.8 6.9 179
## 9 California 93.2 6.2 181
## avg_hr_per_yr
## 4 1,193
## 5 1,267
## 6 1,163
## 7 1,159
## 8 1,229
## 9 1,129
# Remove white space around state names
att5$state <- str_trim(att5$state)
# View the head of att5
head(att5)
## state avg_attend_pct avg_hr_per_day avg_day_per_yr avg_hr_per_yr
## 4 United States 93.1 6.6 180 1,193
## 5 Alabama 93.8 7.0 180 1,267
## 6 Alaska 89.9 6.5 180 1,163
## 7 Arizona 89.0 6.4 181 1,159
## 8 Arkansas 91.8 6.9 179 1,229
## 9 California 93.2 6.2 181 1,129
# Change columns to numeric using dplyr (don't change)
library(dplyr)
example <- mutate_each(att5, funs(as.numeric), -state)
str(example)
## 'data.frame': 52 obs. of 5 variables:
## $ state : chr "United States" "Alabama" "Alaska" "Arizona" ...
## $ avg_attend_pct: num 22 28 8 6 14 23 29 5 7 11 ...
## $ avg_hr_per_day: num 7 11 6 5 10 3 11 6 8 10 ...
## $ avg_day_per_yr: num 10 10 10 11 9 11 2 11 11 11 ...
## $ avg_hr_per_yr : num 26 45 15 13 36 5 28 19 31 42 ...
# Define vector containing numerical columns: cols
cols <- c(2:5)
# Use sapply to coerce cols to numeric
att5[, cols] <- sapply(att5[,cols], as.numeric)
str(att5)
## 'data.frame': 52 obs. of 5 variables:
## $ state : chr "United States" "Alabama" "Alaska" "Arizona" ...
## $ avg_attend_pct: num 22 28 8 6 14 23 29 5 7 11 ...
## $ avg_hr_per_day: num 7 11 6 5 10 3 11 6 8 10 ...
## $ avg_day_per_yr: num 10 10 10 11 9 11 2 11 11 11 ...
## $ avg_hr_per_yr : num 26 45 15 13 36 5 28 19 31 42 ...