Part I

Using one or more TidyVerse packages, and any dataset from fivethirtyeight.com or Kaggle, create a programming sample “vignette” that demonstrates how to use one or more of the capabilities of the selected TidyVerse package with your selected dataset. (25 points)

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.5.3
## -- Attaching packages ------------------------------------------------------------------------ tidyverse 1.2.1 --
## v ggplot2 3.1.1       v purrr   0.3.1  
## v tibble  2.0.1       v dplyr   0.8.0.1
## v tidyr   0.8.3       v stringr 1.4.0  
## v readr   1.3.1       v forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.5.3
## Warning: package 'readr' was built under R version 3.5.3
## Warning: package 'forcats' was built under R version 3.5.3
## -- Conflicts --------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr)
library(ggplot2)

Load the Kaggle avocado price dataset (read here from a copy hosted on GitHub)

# Read the avocado price CSV from its GitHub-hosted copy into a data frame.
avocado_price <- read.csv(
  file = "https://raw.githubusercontent.com/Zchen116/tidyverse/master/avocado.csv"
)

# Show the first six rows, then a per-column summary of the whole table.
head(avocado_price, n = 6L)
summary(object = avocado_price)
##        X                 Date        AveragePrice    Total.Volume     
##  Min.   : 0.00   2015-01-04:  108   Min.   :0.440   Min.   :      85  
##  1st Qu.:10.00   2015-01-11:  108   1st Qu.:1.100   1st Qu.:   10839  
##  Median :24.00   2015-01-18:  108   Median :1.370   Median :  107377  
##  Mean   :24.23   2015-01-25:  108   Mean   :1.406   Mean   :  850644  
##  3rd Qu.:38.00   2015-02-01:  108   3rd Qu.:1.660   3rd Qu.:  432962  
##  Max.   :52.00   2015-02-08:  108   Max.   :3.250   Max.   :62505647  
##                  (Other)   :17601                                     
##      X4046              X4225              X4770        
##  Min.   :       0   Min.   :       0   Min.   :      0  
##  1st Qu.:     854   1st Qu.:    3009   1st Qu.:      0  
##  Median :    8645   Median :   29061   Median :    185  
##  Mean   :  293008   Mean   :  295155   Mean   :  22840  
##  3rd Qu.:  111020   3rd Qu.:  150207   3rd Qu.:   6243  
##  Max.   :22743616   Max.   :20470573   Max.   :2546439  
##                                                         
##    Total.Bags         Small.Bags         Large.Bags     
##  Min.   :       0   Min.   :       0   Min.   :      0  
##  1st Qu.:    5089   1st Qu.:    2849   1st Qu.:    127  
##  Median :   39744   Median :   26363   Median :   2648  
##  Mean   :  239639   Mean   :  182195   Mean   :  54338  
##  3rd Qu.:  110783   3rd Qu.:   83338   3rd Qu.:  22029  
##  Max.   :19373134   Max.   :13384587   Max.   :5719097  
##                                                         
##   XLarge.Bags                 type           year     
##  Min.   :     0.0   conventional:9126   Min.   :2015  
##  1st Qu.:     0.0   organic     :9123   1st Qu.:2015  
##  Median :     0.0                       Median :2016  
##  Mean   :  3106.4                       Mean   :2016  
##  3rd Qu.:   132.5                       3rd Qu.:2017  
##  Max.   :551693.7                       Max.   :2018  
##                                                       
##                  region     
##  Albany             :  338  
##  Atlanta            :  338  
##  BaltimoreWashington:  338  
##  Boise              :  338  
##  Boston             :  338  
##  BuffaloRochester   :  338  
##  (Other)            :16221

Use the filter function to keep only the rows for Boston’s avocado prices

# Keep only the rows whose region is Boston and preview the first six.
avocado_price %>%
  filter(region == "Boston") %>%
  head()

Use arrange function to arrange the Average Price from the highest

# Sort by AveragePrice, highest first, and preview the top six rows.
avocado_price %>%
  arrange(desc(AveragePrice)) %>%
  head()

Use arrange function to arrange the Average Price from the lowest

# Sort by AveragePrice, lowest first, and preview the top six rows.
avocado_price %>%
  arrange(AveragePrice) %>%
  head()

Use select function to choose data of region, year, type, and Average Price to create another data frame

# Keep only the region, year, type and AveragePrice columns; preview six rows.
avocado_price %>%
  select(region, year, type, AveragePrice) %>%
  head()

Use rename function to rename column

# Rename the region column to city (new_name = old_name); preview six rows.
avocado_price %>%
  rename(city = region) %>%
  head()

Use the summarise function to build a data frame with the minimum, mean, and maximum of Average Price

# Collapse the Boston rows into one row holding the min, mean and max price.
avocado_price %>%
  filter(region == "Boston") %>%
  summarise(
    AveragePrice_min = min(AveragePrice),
    AveragePrice_mean = mean(AveragePrice),
    AveragePrice_max = max(AveragePrice)
  )

Use group_by function

# Group the rows by region so the summary below yields one row per region.
by_region <- avocado_price %>%
  group_by(region)

# Per region: number of rows and mean AveragePrice (NA values ignored).
sui <- by_region %>%
  summarise(
    count = n(),
    AveragePrice_mean = mean(AveragePrice, na.rm = TRUE)
  )

# Preview the six regions with the highest mean price.
sui %>%
  arrange(desc(AveragePrice_mean)) %>%
  head()

Use mutate function

# Order regions by mean price (highest first), round the mean to two
# decimals with mutate, and keep the first 20 rows.
my_data <- sui %>%
  arrange(desc(AveragePrice_mean)) %>%
  mutate(AveragePrice_mean = round(AveragePrice_mean, 2)) %>%
  head(20)
my_data

Use ggplot2 function to create graph

# Bar chart of the mean price for each region held in my_data.
# - geom_col() draws bars whose height is the y value; it is the idiomatic
#   equivalent of geom_bar(stat = "identity").
# - guides(fill = "none") hides the fill legend, which would only repeat the
#   x-axis labels. The previous `fill = FALSE` form is deprecated in current
#   ggplot2 releases and emits a warning.
# - The x-axis labels are rotated 60 degrees so long region names do not
#   overlap.
my_data %>%
  ggplot(aes(x = region, y = AveragePrice_mean, fill = region)) +
  geom_col(position = "dodge") +
  guides(fill = "none") +
  ggtitle("Average Price mean") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

Part II

Extend an Existing Example. Using one of your classmate’s examples (as created above), extend his or her example with additional annotated code. (15 points)

1, Get dataset from 538 (existing code):

# Read the KCLT weather-history CSV from the fivethirtyeight data repository.
weather <- read.csv(
  file = "https://raw.githubusercontent.com/fivethirtyeight/data/master/us-weather-history/KCLT.csv"
)

# Print a per-column summary of the table.
summary(object = weather)
##          date     actual_mean_temp actual_min_temp actual_max_temp 
##  2014-10-1 :  1   Min.   :18.00    Min.   : 7.00   Min.   : 26.00  
##  2014-10-10:  1   1st Qu.:47.00    1st Qu.:37.00   1st Qu.: 58.00  
##  2014-10-11:  1   Median :63.00    Median :52.00   Median : 73.00  
##  2014-10-12:  1   Mean   :61.05    Mean   :49.96   Mean   : 71.63  
##  2014-10-13:  1   3rd Qu.:75.00    3rd Qu.:65.00   3rd Qu.: 86.00  
##  2014-10-14:  1   Max.   :88.00    Max.   :75.00   Max.   :100.00  
##  (Other)   :359                                                    
##  average_min_temp average_max_temp record_min_temp record_max_temp 
##  Min.   :29.00    Min.   :50.00    Min.   :-5.00   Min.   : 69.00  
##  1st Qu.:36.00    1st Qu.:58.00    1st Qu.:15.00   1st Qu.: 79.00  
##  Median :48.00    Median :72.00    Median :30.00   Median : 90.00  
##  Mean   :48.82    Mean   :70.98    Mean   :31.47   Mean   : 88.73  
##  3rd Qu.:63.00    3rd Qu.:84.00    3rd Qu.:49.00   3rd Qu.: 98.00  
##  Max.   :68.00    Max.   :89.00    Max.   :62.00   Max.   :104.00  
##                                                                    
##  record_min_temp_year record_max_temp_year actual_precipitation
##  Min.   :1879         Min.   :1879         Min.   :0.0000      
##  1st Qu.:1918         1st Qu.:1931         1st Qu.:0.0000      
##  Median :1963         Median :1953         Median :0.0000      
##  Mean   :1953         Mean   :1954         Mean   :0.1024      
##  3rd Qu.:1983         3rd Qu.:1984         3rd Qu.:0.0300      
##  Max.   :2015         Max.   :2015         Max.   :2.6500      
##                                                                
##  average_precipitation record_precipitation
##  Min.   :0.0900        Min.   :0.850       
##  1st Qu.:0.1000        1st Qu.:1.650       
##  Median :0.1100        Median :1.980       
##  Mean   :0.1141        Mean   :2.209       
##  3rd Qu.:0.1200        3rd Qu.:2.540       
##  Max.   :0.1500        Max.   :6.880       
## 

2, Tidyr separate function (existing code):

library(tidyr)
# Split the date column on "-" into three new columns: year, month and day.
weather2 <- separate(
  weather,
  col = date,
  into = c("year", "month", "day"),
  sep = "-"
)

# Print a per-column summary of the reshaped table.
summary(object = weather2)
##      year              month               day            actual_mean_temp
##  Length:365         Length:365         Length:365         Min.   :18.00   
##  Class :character   Class :character   Class :character   1st Qu.:47.00   
##  Mode  :character   Mode  :character   Mode  :character   Median :63.00   
##                                                           Mean   :61.05   
##                                                           3rd Qu.:75.00   
##                                                           Max.   :88.00   
##  actual_min_temp actual_max_temp  average_min_temp average_max_temp
##  Min.   : 7.00   Min.   : 26.00   Min.   :29.00    Min.   :50.00   
##  1st Qu.:37.00   1st Qu.: 58.00   1st Qu.:36.00    1st Qu.:58.00   
##  Median :52.00   Median : 73.00   Median :48.00    Median :72.00   
##  Mean   :49.96   Mean   : 71.63   Mean   :48.82    Mean   :70.98   
##  3rd Qu.:65.00   3rd Qu.: 86.00   3rd Qu.:63.00    3rd Qu.:84.00   
##  Max.   :75.00   Max.   :100.00   Max.   :68.00    Max.   :89.00   
##  record_min_temp record_max_temp  record_min_temp_year
##  Min.   :-5.00   Min.   : 69.00   Min.   :1879        
##  1st Qu.:15.00   1st Qu.: 79.00   1st Qu.:1918        
##  Median :30.00   Median : 90.00   Median :1963        
##  Mean   :31.47   Mean   : 88.73   Mean   :1953        
##  3rd Qu.:49.00   3rd Qu.: 98.00   3rd Qu.:1983        
##  Max.   :62.00   Max.   :104.00   Max.   :2015        
##  record_max_temp_year actual_precipitation average_precipitation
##  Min.   :1879         Min.   :0.0000       Min.   :0.0900       
##  1st Qu.:1931         1st Qu.:0.0000       1st Qu.:0.1000       
##  Median :1953         Median :0.0000       Median :0.1100       
##  Mean   :1954         Mean   :0.1024       Mean   :0.1141       
##  3rd Qu.:1984         3rd Qu.:0.0300       3rd Qu.:0.1200       
##  Max.   :2015         Max.   :2.6500       Max.   :0.1500       
##  record_precipitation
##  Min.   :0.850       
##  1st Qu.:1.650       
##  Median :1.980       
##  Mean   :2.209       
##  3rd Qu.:2.540       
##  Max.   :6.880

3, Dplyr select function (existing code):

# Keep the year plus the mean-temperature and record columns; preview six rows.
weather2 %>%
  select(
    year,
    actual_mean_temp,
    record_min_temp,
    record_max_temp,
    record_precipitation
  ) %>%
  head()

4, Dplyr filter (subsetting dataset) (existing code):

# Keep only the 2014 rows (year is compared as a string) and preview six.
weather2 %>%
  filter(year == "2014") %>%
  head()

Extend Part:

Use the select and filter functions

# Narrow the table to the year and the four "actual" measurement columns.
weather3 <- select(
  weather2,
  year,
  actual_mean_temp,
  actual_min_temp,
  actual_max_temp,
  actual_precipitation
)

# Preview the first six rows recorded in 2014.
weather3 %>%
  filter(year == "2014") %>%
  head()

Use the summarise function to build a data frame with the minimum, mean, and maximum temperatures

# For the 2014 rows, report the lowest daily minimum, the mean of the daily
# mean temperatures, and the highest daily maximum.
weather3 %>%
  filter(year == "2014") %>%
  summarise(
    min = min(actual_min_temp),
    mean = mean(actual_mean_temp),
    max = max(actual_max_temp)
  )