── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# A tibble: 904 × 10
Year Month Day Federal.Funds.Target.Rate Federal.Funds.Upper.Target
<int> <int> <int> <dbl> <dbl>
1 1954 7 1 NA NA
2 1954 8 1 NA NA
3 1954 9 1 NA NA
4 1954 10 1 NA NA
5 1954 11 1 NA NA
6 1954 12 1 NA NA
7 1955 1 1 NA NA
8 1955 2 1 NA NA
9 1955 3 1 NA NA
10 1955 4 1 NA NA
Federal.Funds.Lower.Target Effective.Federal.Funds.Rate
<dbl> <dbl>
1 NA 0.8
2 NA 1.22
3 NA 1.06
4 NA 0.85
5 NA 0.83
6 NA 1.28
7 NA 1.39
8 NA 1.29
9 NA 1.35
10 NA 1.43
Real.GDP..Percent.Change. Unemployment.Rate Inflation.Rate
<dbl> <dbl> <dbl>
1 4.6 5.8 NA
2 NA 6 NA
3 NA 6.1 NA
4 8 5.7 NA
5 NA 5.3 NA
6 NA 5 NA
7 11.9 4.9 NA
8 NA 4.7 NA
9 NA 4.6 NA
10 6.7 4.7 NA
# ℹ 894 more rows
# A tibble: 119,390 × 32
hotel is_canceled lead_time arrival_date_year arrival_date_month
<chr> <int> <int> <int> <chr>
1 Resort Hotel 0 342 2015 July
2 Resort Hotel 0 737 2015 July
3 Resort Hotel 0 7 2015 July
4 Resort Hotel 0 13 2015 July
5 Resort Hotel 0 14 2015 July
6 Resort Hotel 0 14 2015 July
7 Resort Hotel 0 0 2015 July
8 Resort Hotel 0 9 2015 July
9 Resort Hotel 1 85 2015 July
10 Resort Hotel 1 75 2015 July
arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights
<int> <int> <int>
1 27 1 0
2 27 1 0
3 27 1 0
4 27 1 0
5 27 1 0
6 27 1 0
7 27 1 0
8 27 1 0
9 27 1 0
10 27 1 0
stays_in_week_nights adults children babies meal country market_segment
<int> <int> <int> <int> <chr> <chr> <chr>
1 0 2 0 0 BB PRT Direct
2 0 2 0 0 BB PRT Direct
3 1 1 0 0 BB GBR Direct
4 1 1 0 0 BB GBR Corporate
5 2 2 0 0 BB GBR Online TA
6 2 2 0 0 BB GBR Online TA
7 2 2 0 0 BB PRT Direct
8 2 2 0 0 FB PRT Direct
9 3 2 0 0 BB PRT Online TA
10 3 2 0 0 HB PRT Offline TA/TO
distribution_channel is_repeated_guest previous_cancellations
<chr> <int> <int>
1 Direct 0 0
2 Direct 0 0
3 Direct 0 0
4 Corporate 0 0
5 TA/TO 0 0
6 TA/TO 0 0
7 Direct 0 0
8 Direct 0 0
9 TA/TO 0 0
10 TA/TO 0 0
previous_bookings_not_canceled reserved_room_type assigned_room_type
<int> <chr> <chr>
1 0 C C
2 0 C C
3 0 A C
4 0 A A
5 0 A A
6 0 A A
7 0 C C
8 0 C C
9 0 A A
10 0 D D
booking_changes deposit_type agent company days_in_waiting_list customer_type
<int> <chr> <chr> <chr> <int> <chr>
1 3 No Deposit NULL NULL 0 Transient
2 4 No Deposit NULL NULL 0 Transient
3 0 No Deposit NULL NULL 0 Transient
4 0 No Deposit 304 NULL 0 Transient
5 0 No Deposit 240 NULL 0 Transient
6 0 No Deposit 240 NULL 0 Transient
7 0 No Deposit NULL NULL 0 Transient
8 0 No Deposit 303 NULL 0 Transient
9 0 No Deposit 240 NULL 0 Transient
10 0 No Deposit 15 NULL 0 Transient
adr required_car_parking_spaces total_of_special_requests
<dbl> <int> <int>
1 0 0 0
2 0 0 0
3 75 0 0
4 75 0 0
5 98 0 1
6 98 0 1
7 107 0 0
8 103 0 1
9 82 0 1
10 106. 0 0
reservation_status reservation_status_date
<chr> <chr>
1 Check-Out 2015-07-01
2 Check-Out 2015-07-01
3 Check-Out 2015-07-02
4 Check-Out 2015-07-02
5 Check-Out 2015-07-03
6 Check-Out 2015-07-03
7 Check-Out 2015-07-03
8 Check-Out 2015-07-03
9 Canceled 2015-05-06
10 Canceled 2015-04-22
# ℹ 119,380 more rows
3. clean the data
3.1 “poultry” data set
Arrange the “product” column through the alphabet order, which makes the table cleaner and easier to analyze through different categories in the “product” column.
# A tibble: 600 × 4
Product Year Month Price_Dollar
<chr> <dbl> <chr> <dbl>
1 B/S Breast 2013 January 7.04
2 B/S Breast 2013 February 7.04
3 B/S Breast 2013 March 7.04
4 B/S Breast 2013 April 7.04
5 B/S Breast 2013 May 7.04
6 B/S Breast 2013 June 7.04
7 B/S Breast 2013 July 7.04
8 B/S Breast 2013 August 7.04
9 B/S Breast 2013 September 7.04
10 B/S Breast 2013 October 7.04
# ℹ 590 more rows
3.2 “organiceggpoultry” data set
3.2.1 “whole data” sheet
Cleaning the “whole data” sheet through 1) separating the “Year and Month” column and putting the information into “month” and “year” two columns, and 2) cleaning the value “too few” in the “USDA Certified Organic Young Chicken_Bone in Breast” and “USDA Certified Organic Young Chicken_Thighs” columns and change the variables in the columns into numerical variables.
organiceggpoultry_whole_data_clean <- organiceggpoultry_whole_data %>%mutate(`Year and Month`=str_remove( `Year and Month`, " /1")) %>%separate(`Year and Month`, into=c("month", "year"), sep=" ") %>%fill(year)%>%mutate(`USDA Certified Organic Young Chicken_Bone in Breast`=str_replace(`USDA Certified Organic Young Chicken_Bone in Breast`,"too few","N/A"),`USDA Certified Organic Young Chicken_Bone in Breast`=as.numeric(`USDA Certified Organic Young Chicken_Bone in Breast`))%>%mutate(`USDA Certified Organic Young Chicken_Thighs`=str_replace(`USDA Certified Organic Young Chicken_Thighs`,"too few","N/A"),`USDA Certified Organic Young Chicken_Thighs`=as.numeric(`USDA Certified Organic Young Chicken_Thighs`))tibble(organiceggpoultry_whole_data_clean)%>%print(n =10, width =Inf)
# A tibble: 120 × 11
month year `USDA Certified Organic Eggs_Extra Large Dozen`
<chr> <chr> <dbl>
1 Jan 2004 230
2 February 2004 230
3 March 2004 230
4 April 2004 234.
5 May 2004 236
6 June 2004 241
7 July 2004 241
8 August 2004 241
9 September 2004 241
10 October 2004 241
`USDA Certified Organic Eggs_Extra Large 1/2 Doz`
<dbl>
1 132
2 134.
3 137
4 137
5 137
6 137
7 137
8 137
9 136.
10 136.
`USDA Certified Organic Eggs_Large Dozen`
<dbl>
1 230
2 226.
3 225
4 225
5 225
6 231.
7 234.
8 234.
9 234.
10 234.
`USDA Certified Organic Eggs_Large 1/2 Doz`
<dbl>
1 126
2 128.
3 131
4 131
5 131
6 134.
7 134.
8 134.
9 130.
10 128.
`USDA Certified Organic Young Chicken_Whole`
<dbl>
1 198.
2 198.
3 209
4 212
5 214.
6 216.
7 217
8 217
9 217
10 217
`USDA Certified Organic Young Chicken_B/S Breast`
<dbl>
1 646.
2 642.
3 642.
4 642.
5 642.
6 641
7 642.
8 642.
9 642.
10 642.
`USDA Certified Organic Young Chicken_Bone in Breast`
<dbl>
1 NA
2 NA
3 NA
4 NA
5 NA
6 NA
7 390.
8 390.
9 390.
10 390.
`USDA Certified Organic Young Chicken_Whole Legs`
<dbl>
1 194.
2 194.
3 194.
4 194.
5 194.
6 202.
7 204.
8 204.
9 204.
10 204.
`USDA Certified Organic Young Chicken_Thighs`
<dbl>
1 NA
2 203
3 203
4 203
5 203
6 200.
7 200.
8 200.
9 200.
10 200.
# ℹ 110 more rows
3.2.2 “Organic egg prices” sheet
Cleaning the “Organic egg prices” sheet by cleaning the titles and empty cells.
# A tibble: 43 × 15
Dozen Jan. Feb. Mar. Apr. May June July Aug. Sep.
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 Whole 238.5 238.5 238.5 238.5 238.5 238.5 238.5 238.5 238.5
2 B/S Breast 703.75 703.75 703.75 703.75 703.75 703.75 703.75 703.75 703.75
3 Bone-in Breast 390.5 390.5 390.5 390.5 390.5 390.5 390.5 390.5 390.5
4 Whole Legs 203.5 203.5 203.5 203.5 203.5 203.5 203.5 203.5 203.5
5 Thighs 216.25 216.25 216.25 216.25 216.25 216.25 216.25 216.25 216.25
6 Whole 235 238.5 238.5 238.5 238.5 238.5 238.5 238.5 238.5
7 B/S Breast 637.5 700 700 700 700 700 700 700 700
8 Bone-in Breast 390.5 390.5 390.5 390.5 390.5 390.5 390.5 390.5 390.5
9 Whole Legs 203.5 203.5 203.5 203.5 203.5 203.5 203.5 203.5 203.5
10 Thighs 215 215 215 215 215 215 215 216.25 216.25
Oct. Nov. Dec. year Type
<chr> <chr> <chr> <chr> <chr>
1 238.5 238.5 238.5 2013 Organic young chicken
2 703.75 703.75 703.75 2013 Organic young chicken
3 390.5 390.5 390.5 2013 Organic young chicken
4 203.5 203.5 203.5 2013 Organic young chicken
5 216.25 216.25 216.25 2013 Organic young chicken
6 238.5 238.5 238.5 2012 Organic young chicken
7 700 703.75 703.75 2012 Organic young chicken
8 390.5 390.5 390.5 2012 Organic young chicken
9 203.5 203.5 203.5 2012 Organic young chicken
10 216.25 216.25 216.25 2012 Organic young chicken
# ℹ 33 more rows
3.3 “FedFundsRate” data set
Cleaning the “FedFundsRate” data set by 1) cleaning the “month” column with the “month. name” function to make values in the column easier to understand and 2) pulling out the nulls with no values.
# A tibble: 904 × 10
Year Month Day Federal.Funds.Target.Rate Federal.Funds.Upper.Target
<int> <chr> <int> <dbl> <dbl>
1 1954 July 1 NA NA
2 1954 August 1 NA NA
3 1954 September 1 NA NA
4 1954 October 1 NA NA
5 1954 November 1 NA NA
6 1954 December 1 NA NA
7 1955 January 1 NA NA
8 1955 February 1 NA NA
9 1955 March 1 NA NA
10 1955 April 1 NA NA
Federal.Funds.Lower.Target Effective.Federal.Funds.Rate
<dbl> <dbl>
1 NA 0.8
2 NA 1.22
3 NA 1.06
4 NA 0.85
5 NA 0.83
6 NA 1.28
7 NA 1.39
8 NA 1.29
9 NA 1.35
10 NA 1.43
Real.GDP..Percent.Change. Unemployment.Rate Inflation.Rate
<dbl> <dbl> <dbl>
1 4.6 5.8 NA
2 NA 6 NA
3 NA 6.1 NA
4 8 5.7 NA
5 NA 5.3 NA
6 NA 5 NA
7 11.9 4.9 NA
8 NA 4.7 NA
9 NA 4.6 NA
10 6.7 4.7 NA
# ℹ 894 more rows
# A tibble: 10 × 2
name value
<chr> <int>
1 Federal.Funds.Upper.Target 801
2 Federal.Funds.Lower.Target 801
3 Real.GDP..Percent.Change. 654
4 Federal.Funds.Target.Rate 442
5 Inflation.Rate 194
6 Effective.Federal.Funds.Rate 152
7 Unemployment.Rate 152
8 Year 0
9 Month 0
10 Day 0
3.4 “hotel_bookings” data set
Cleaning the “hotel_bookings” data set by mutating “arrival_date_year,” “arrival_date_month,” and “arrival_date_day_of_month” into the “arrival_date” column.
# A tibble: 120 × 7
Year Month `B/S Breast` `Bone-in Breast` Thighs Whole `Whole Legs`
<dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2013 January 7.04 3.90 2.16 2.38 2.04
2 2013 February 7.04 3.90 2.16 2.38 2.04
3 2013 March 7.04 3.90 2.16 2.38 2.04
4 2013 April 7.04 3.90 2.16 2.38 2.04
5 2013 May 7.04 3.90 2.16 2.38 2.04
6 2013 June 7.04 3.90 2.16 2.38 2.04
7 2013 July 7.04 3.90 2.16 2.38 2.04
8 2013 August 7.04 3.90 2.16 2.38 2.04
9 2013 September 7.04 3.90 2.16 2.38 2.04
10 2013 October 7.04 3.90 2.16 2.38 2.04
# ℹ 110 more rows
4.2 “organiceggpoultry” data set
4.2.1 “whole data” sheet
Pivoting the “whole data” sheet by putting the titles from “USDA Certified Organic Eggs_Extra Large Dozen” to “USDA Certified Organic Young Chicken_Thighs_clean” into the “element&dozen”column.
organiceggpoultry_whole_data_pivot <-pivot_longer(organiceggpoultry_whole_data_clean, cols =c("USDA Certified Organic Eggs_Extra Large Dozen", "USDA Certified Organic Eggs_Extra Large 1/2 Doz", "USDA Certified Organic Eggs_Large Dozen", "USDA Certified Organic Eggs_Large 1/2 Doz", "USDA Certified Organic Young Chicken_Whole", "USDA Certified Organic Young Chicken_B/S Breast", "USDA Certified Organic Young Chicken_Bone in Breast","USDA Certified Organic Young Chicken_Whole Legs", "USDA Certified Organic Young Chicken_Thighs"), names_to ="element&dozen", values_to ="value")tibble(organiceggpoultry_whole_data_pivot)%>%print(n =10, width =Inf)
# A tibble: 1,080 × 4
month year `element&dozen` value
<chr> <chr> <chr> <dbl>
1 Jan 2004 USDA Certified Organic Eggs_Extra Large Dozen 230
2 Jan 2004 USDA Certified Organic Eggs_Extra Large 1/2 Doz 132
3 Jan 2004 USDA Certified Organic Eggs_Large Dozen 230
4 Jan 2004 USDA Certified Organic Eggs_Large 1/2 Doz 126
5 Jan 2004 USDA Certified Organic Young Chicken_Whole 198.
6 Jan 2004 USDA Certified Organic Young Chicken_B/S Breast 646.
7 Jan 2004 USDA Certified Organic Young Chicken_Bone in Breast NA
8 Jan 2004 USDA Certified Organic Young Chicken_Whole Legs 194.
9 Jan 2004 USDA Certified Organic Young Chicken_Thighs NA
10 February 2004 USDA Certified Organic Eggs_Extra Large Dozen 230
# ℹ 1,070 more rows
4.2.2 “Organic egg prices” sheet
Pivoting the “Organic egg prices” sheet by putting the titles from “Jan.” to “Dec.” into the “months” column.
# A tibble: 6,328 × 5
Year Month Day Rate_Types value
<int> <chr> <int> <chr> <dbl>
1 1954 July 1 Federal.Funds.Target.Rate NA
2 1954 July 1 Federal.Funds.Upper.Target NA
3 1954 July 1 Federal.Funds.Lower.Target NA
4 1954 July 1 Effective.Federal.Funds.Rate 0.8
5 1954 July 1 Real.GDP..Percent.Change. 4.6
6 1954 July 1 Unemployment.Rate 5.8
7 1954 July 1 Inflation.Rate NA
8 1954 August 1 Federal.Funds.Target.Rate NA
9 1954 August 1 Federal.Funds.Upper.Target NA
10 1954 August 1 Federal.Funds.Lower.Target NA
# ℹ 6,318 more rows
4.4 “hotel_bookings” data set
As this is a wide data set containing information, keeping the data set after cleaning is a better choice than doing further pivoting by using the “pivot_longer” and “pivot_wider” functions.
5. Data analysis
Analysis of the data sets by calculating the mean, median, standard deviation, mode, and quantile.
`summarise()` has grouped output by 'Year', 'Rate_Types'. You can override
using the `.groups` argument.
FedFundsRate_summary%>%print(n =10, width =Inf)
# A tibble: 700 × 11
Year Rate_Types FedFundsRate_mean_number
<int> <chr> <dbl>
1 1954 Effective.Federal.Funds.Rate 1.01
2 1954 Effective.Federal.Funds.Rate 1.01
3 1954 Effective.Federal.Funds.Rate 1.01
4 1954 Effective.Federal.Funds.Rate 1.01
5 1954 Effective.Federal.Funds.Rate 1.01
6 1954 Effective.Federal.Funds.Rate 1.01
7 1954 Federal.Funds.Lower.Target NaN
8 1954 Federal.Funds.Target.Rate NaN
9 1954 Federal.Funds.Upper.Target NaN
10 1954 Inflation.Rate NaN
` FedFundsRate_median_number` ` FedFundsRate_sd_number`
<dbl> <dbl>
1 0.955 0.210
2 0.955 0.210
3 0.955 0.210
4 0.955 0.210
5 0.955 0.210
6 0.955 0.210
7 NA NA
8 NA NA
9 NA NA
10 NA NA
` FedFundsRate_max_number` ` FedFundsRate_min_number`
<dbl> <dbl>
1 1.28 0.8
2 1.28 0.8
3 1.28 0.8
4 1.28 0.8
5 1.28 0.8
6 1.28 0.8
7 -Inf Inf
8 -Inf Inf
9 -Inf Inf
10 -Inf Inf
` FedFundsRate_mode_number` ` FedFundsRate_quant25` ` FedFundsRate_quant50`
<dbl> <dbl> <dbl>
1 0.8 0.835 0.955
2 1.22 0.835 0.955
3 1.06 0.835 0.955
4 0.85 0.835 0.955
5 0.83 0.835 0.955
6 1.28 0.835 0.955
7 NA NA NA
8 NA NA NA
9 NA NA NA
10 NA NA NA
` FedFundsRate_quant75`
<dbl>
1 1.18
2 1.18
3 1.18
4 1.18
5 1.18
6 1.18
7 NA
8 NA
9 NA
10 NA
# ℹ 690 more rows
5.4 “hotel_bookings” data set
# 1. Analysis of the "lead_time", which is the time used from the start of the process till the end for two different hotels. find_mode <-function(x) { u <-unique(x) tab <-tabulate(match(x, u)) u[tab ==max(tab)]}q =c(.25, .5, .75) lead_time_summary <- hotel_bookings_clean %>%group_by(hotel)%>%summarise(mean_number=mean(lead_time, na.rm =TRUE), median_number =median(lead_time, na.rm =TRUE), sd_number =sd(lead_time,na.rm =TRUE), max_number=max(lead_time, na.rm =TRUE), min_number=min(lead_time, na.rm =TRUE), mode_number=find_mode(lead_time), quant25 =quantile(lead_time, na.rm = T, probs = q[1]),quant50 =quantile(lead_time, na.rm = T, probs = q[2]), quant75 =quantile(lead_time, na.rm = T, probs = q[3]))%>%ungroup()lead_time_summary%>%print(n =10, width =Inf)