Challenge 5

Author

Jingyi Yang

1. Set Up

knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)
library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggplot2)
library(readr)
library(readxl)

2. Import the data

2.1 Import the “NYC_Airbnb” data set

# Read the NYC Airbnb listings straight from the data directory. The path is
# built with file.path() instead of calling setwd(), so the session's working
# directory is left untouched.
data_dir <- file.path("C:", "8-601", "challenge_datasets")
NYC_Airbnb <- read.csv(file.path(data_dir, "AB_NYC_2019.csv"))

# Preview the first 10 rows with every column shown.
as_tibble(NYC_Airbnb) %>%
  print(n = 10, width = Inf)

# A tibble: 48,895 × 16
      id name                                               host_id host_name  
   <int> <chr>                                                <int> <chr>      
 1  2539 "Clean & quiet apt home by the park"                  2787 John       
 2  2595 "Skylit Midtown Castle"                               2845 Jennifer   
 3  3647 "THE VILLAGE OF HARLEM....NEW YORK !"                 4632 Elisabeth  
 4  3831 "Cozy Entire Floor of Brownstone"                     4869 LisaRoxanne
 5  5022 "Entire Apt: Spacious Studio/Loft by central park"    7192 Laura      
 6  5099 "Large Cozy 1 BR Apartment In Midtown East"           7322 Chris      
 7  5121 "BlissArtsSpace!"                                     7356 Garon      
 8  5178 "Large Furnished Room Near B'way "                    8967 Shunichi   
 9  5203 "Cozy Clean Guest Room - Family Apt"                  7490 MaryEllen  
10  5238 "Cute & Cozy Lower East Side 1 bdrm"                  7549 Ben        
   neighbourhood_group neighbourhood      latitude longitude room_type      
   <chr>               <chr>                 <dbl>     <dbl> <chr>          
 1 Brooklyn            Kensington             40.6     -74.0 Private room   
 2 Manhattan           Midtown                40.8     -74.0 Entire home/apt
 3 Manhattan           Harlem                 40.8     -73.9 Private room   
 4 Brooklyn            Clinton Hill           40.7     -74.0 Entire home/apt
 5 Manhattan           East Harlem            40.8     -73.9 Entire home/apt
 6 Manhattan           Murray Hill            40.7     -74.0 Entire home/apt
 7 Brooklyn            Bedford-Stuyvesant     40.7     -74.0 Private room   
 8 Manhattan           Hell's Kitchen         40.8     -74.0 Private room   
 9 Manhattan           Upper West Side        40.8     -74.0 Private room   
10 Manhattan           Chinatown              40.7     -74.0 Entire home/apt
   price minimum_nights number_of_reviews last_review  reviews_per_month
   <int>          <int>             <int> <chr>                    <dbl>
 1   149              1                 9 "2018-10-19"              0.21
 2   225              1                45 "2019-05-21"              0.38
 3   150              3                 0 ""                       NA   
 4    89              1               270 "2019-07-05"              4.64
 5    80             10                 9 "2018-11-19"              0.1 
 6   200              3                74 "2019-06-22"              0.59
 7    60             45                49 "2017-10-05"              0.4 
 8    79              2               430 "2019-06-24"              3.47
 9    79              2               118 "2017-07-21"              0.99
10   150              1               160 "2019-06-09"              1.33
   calculated_host_listings_count availability_365
                            <int>            <int>
 1                              6              365
 2                              2              355
 3                              1              365
 4                              1              194
 5                              1                0
 6                              1              129
 7                              1                0
 8                              1              220
 9                              1                0
10                              4              188
# ℹ 48,885 more rows

2.2 Import the “pathogens” data set

# Read the foodborne-pathogen workbook straight from the data directory. The
# path is built with file.path() instead of calling setwd(), so the session's
# working directory is left untouched.
data_dir <- file.path("C:", "8-601", "challenge_datasets")
pathogens <- read_excel(
  file.path(data_dir, "Total_cost_for_top_15_pathogens_2018.xlsx")
)

# Preview the first 10 rows with every column shown.
pathogens %>%
  print(n = 10, width = Inf)

# A tibble: 27 × 3
   Total cost of foodborne illness estimates for 15 leading foodborne pathogen…¹
   <chr>                                                                        
 1 <NA>                                                                         
 2 <NA>                                                                         
 3 <NA>                                                                         
 4 <NA>                                                                         
 5 Campylobacter spp. (all species)                                             
 6 Clostridium perfringens                                                      
 7 Cryptosporidium spp. (all species)                                           
 8 Cyclospora cayetanensis                                                      
 9 Listeria monocytogenes                                                       
10 Norovirus                                                                    
   ...2                 ...3              
   <chr>                <chr>             
 1 <NA>                 <NA>              
 2 Mean estimates, 2018 <NA>              
 3 <NA>                 <NA>              
 4 Cases                Cost              
 5 845024               2181485783.4322653
 6 965958               384277855.5749535 
 7 57616                58394152.170415238
 8 11407.000000000002   2571517.8543861103
 9 1591                 3189686110.4762712
10 5461731              2566984191.1498566
# ℹ 17 more rows
# ℹ abbreviated name: ¹`Total cost of foodborne illness estimates for 15 leading foodborne pathogens`

3. Clean the data

3.1 Clean the “NYC_Airbnb” data set

Cleaning the data is also an opportunity to understand it. “id” and “name” both identify the Airbnb listing (a room or house), and “host_id” and “host_name” both identify its owner. Because the values in the “name” and “host_name” columns are not concise, dropping them and keeping the corresponding ID columns makes the data set cleaner and easier to understand. As the IDs identify listings and people rather than quantities, they should be treated as categorical variables. “neighbourhood_group” and “neighbourhood” refer to the area in which the listing is located, “latitude” is the latitude of the listing, and “longitude” is its longitude. “room_type” gives the type of room, and “price” is the listing price. “minimum_nights” is the minimum number of nights a guest is required to stay. “number_of_reviews” is the total number of reviews posted online. “last_review” is the date on which the most recent review was posted. “reviews_per_month” is the average number of reviews per month. “calculated_host_listings_count” is the total number of listings the host has. “availability_365” is the number of days in a year (365 days) on which the listing is available. More information about the data set can be found here.

# Build the analysis table:
#   * drop the free-text `name` and `host_name` columns (the ID columns stay),
#   * store both IDs as character so they are treated as labels, not numbers,
#   * parse `last_review` into a Date.
NYC_Airbnb_clean <- NYC_Airbnb %>%
  select(-name, -host_name) %>%
  mutate(
    id = as.character(id),
    host_id = as.character(host_id),
    # Without an explicit format, as.Date() guesses the format from the first
    # non-missing element and stops with an error if that element is an empty
    # string. With the format given, blank review dates simply become NA.
    last_review = as.Date(last_review, format = "%Y-%m-%d")
  )

# Preview the first 10 rows with every column shown.
as_tibble(NYC_Airbnb_clean) %>%
  print(n = 10, width = Inf)

# A tibble: 48,895 × 14
   id    host_id neighbourhood_group neighbourhood      latitude longitude
   <chr> <chr>   <chr>               <chr>                 <dbl>     <dbl>
 1 2539  2787    Brooklyn            Kensington             40.6     -74.0
 2 2595  2845    Manhattan           Midtown                40.8     -74.0
 3 3647  4632    Manhattan           Harlem                 40.8     -73.9
 4 3831  4869    Brooklyn            Clinton Hill           40.7     -74.0
 5 5022  7192    Manhattan           East Harlem            40.8     -73.9
 6 5099  7322    Manhattan           Murray Hill            40.7     -74.0
 7 5121  7356    Brooklyn            Bedford-Stuyvesant     40.7     -74.0
 8 5178  8967    Manhattan           Hell's Kitchen         40.8     -74.0
 9 5203  7490    Manhattan           Upper West Side        40.8     -74.0
10 5238  7549    Manhattan           Chinatown              40.7     -74.0
   room_type       price minimum_nights number_of_reviews last_review
   <chr>           <int>          <int>             <int> <date>     
 1 Private room      149              1                 9 2018-10-19 
 2 Entire home/apt   225              1                45 2019-05-21 
 3 Private room      150              3                 0 NA         
 4 Entire home/apt    89              1               270 2019-07-05 
 5 Entire home/apt    80             10                 9 2018-11-19 
 6 Entire home/apt   200              3                74 2019-06-22 
 7 Private room       60             45                49 2017-10-05 
 8 Private room       79              2               430 2019-06-24 
 9 Private room       79              2               118 2017-07-21 
10 Entire home/apt   150              1               160 2019-06-09 
   reviews_per_month calculated_host_listings_count availability_365
               <dbl>                          <int>            <int>
 1              0.21                              6              365
 2              0.38                              2              355
 3             NA                                 1              365
 4              4.64                              1              194
 5              0.1                               1                0
 6              0.59                              1              129
 7              0.4                               1                0
 8              3.47                              1              220
 9              0.99                              1                0
10              1.33                              4              188
# ℹ 48,885 more rows

3.2 Clean the “pathogens” data set

The data set includes information about different species of pathogens, the number of cases of illness each one causes, and the cost of those illnesses. Cleaning this data set — deleting the header, footnotes, and other rows that do not contain values, and creating a new column with an abbreviation for each type of pathogen — is necessary because it makes the data set easier to analyze.

# Build the analysis table for the pathogen workbook:
#   * na.omit() drops every row with a missing cell (title, header and
#     footnote rows of the spreadsheet),
#   * the three columns get short names,
#   * row 16 of what remains is dropped, leaving 15 rows
#     (NOTE(review): presumably the totals row -- confirm against the workbook),
#   * Cases and Cost are converted from text to numbers,
#   * Species_abbr holds a short label for each species.
pathogens_clean <- pathogens %>%
  na.omit() %>%
  rename(
    "Species" = "Total cost of foodborne illness estimates for 15 leading foodborne pathogens",
    "Cases" = "...2",
    "Cost" = "...3"
  ) %>%
  slice(-16) %>%
  mutate(
    Cases = as.numeric(Cases),
    Cost = as.numeric(Cost),
    # strict = TRUE can return identical abbreviations for different species
    # (e.g. "Cs(s." for both Campylobacter and Cryptosporidium), which merges
    # them into one category when Species_abbr is used as a plot axis.
    # strict = FALSE lets abbreviate() lengthen labels until they are unique.
    Species_abbr = abbreviate(
      Species,
      minlength = 4,
      dot = TRUE,
      strict = FALSE,
      named = FALSE
    )
  )

# Preview the first 10 rows with every column shown.
pathogens_clean %>%
  print(n = 10, width = Inf)

# A tibble: 15 × 4
   Species                                                           Cases
   <chr>                                                             <dbl>
 1 Campylobacter spp. (all species)                                 845024
 2 Clostridium perfringens                                          965958
 3 Cryptosporidium spp. (all species)                                57616
 4 Cyclospora cayetanensis                                           11407
 5 Listeria monocytogenes                                             1591
 6 Norovirus                                                       5461731
 7 Salmonella (non-typhoidal species)                              1027561
 8 Shigella (all species)                                           131254
 9 Shiga toxin-producing Escherichia coli O157 (STEC O157)           63153
10 non-O157 Shiga toxin-producing Escherichia coli (STEC non-O157)  112752
          Cost Species_abbr
         <dbl> <chr>       
 1 2181485783. Cs(s.       
 2  384277856. Clsp.       
 3   58394152. Cs(s.       
 4    2571518. Cycc.       
 5 3189686110. Lstm.       
 6 2566984191. Nrvr.       
 7 4142179161. S(-s.       
 8  159202402. S(s).       
 9  311036907. StEcO(O.    
10   31701852. nStEc(n.    
# ℹ 5 more rows

4. Univariate Visualizations

4.1 “NYC_Airbnb” data set

# Analyze the "Room Type"

##| Choosing the "bar" function because 1) the variable is a categorical variable and 2) it can clearly display the frequency. During the visualization, some arguments like "fill=" and "labs()" make the graphic more straightforward to understand. 

# Bar chart of how many listings fall into each room type; bars are coloured
# by room type and the legend is titled "Room Type".
NYC_Airbnb_clean %>%
  ggplot(mapping = aes(x = room_type, fill = room_type)) +
  geom_bar() +
  labs(
    title = "Room Type Frequency",
    x = "Room Type",
    y = "Count"
  ) +
  scale_fill_discrete(name = "Room Type")

# Analyze the "Price"

##| Choosing the "histogram" function because 1) the variable is numerical, and 2) it can clearly display the frequency as it can automatically count the number of data points per bin. Some arguments like " coord_cartesian()" and "labs()," which can customize the graphic, can make it more straightforward to understand. 

# Histogram of listing prices.
#
# The x scale no longer sets `limits = range(price)`: scale limits censor any
# bar whose edges extend past them, and the first and last bins always do, so
# those bars were silently dropped. Leaving the limits to ggplot2 keeps every
# bar. The stray `position = "dodge"` argument to ggplot() had no effect and
# is removed.
ggplot(NYC_Airbnb_clean, aes(x = price)) +
  # 30 bins is the ggplot2 default; stating it makes the choice explicit.
  geom_histogram(bins = 30) +
  scale_x_continuous(n.breaks = 15) +
  # coord_cartesian() zooms without discarding data: bars taller than 13000
  # are cut off at the top of the panel rather than removed.
  coord_cartesian(ylim = c(0, 13000)) +
  labs(x = " Price", y = "Count", title = "Price Frequency")

# Analyze the "Availability"

##| Choosing the "histogram" function because 1) the variable is numerical, and 2) it can clearly display the frequency as it can automatically count the number of data points per bin.

# Histogram of the number of days per year each listing is available.
#
# The x scale no longer sets `limits = range(availability_365)`: scale limits
# censor any bar whose edges extend past them, and the first and last bins
# always do, so those bars were silently dropped. Leaving the limits to
# ggplot2 keeps every bar.
ggplot(NYC_Airbnb_clean, aes(x = availability_365)) +
  # 30 bins is the ggplot2 default; stating it makes the choice explicit.
  geom_histogram(bins = 30) +
  scale_x_continuous(n.breaks = 10) +
  # coord_cartesian() zooms without discarding data: bars taller than 2000
  # are cut off at the top of the panel rather than removed.
  coord_cartesian(ylim = c(0, 2000)) +
  labs(x = " Availability", y = "Count", title = "Availability Frequency")

# Analyze the "Reviews Per Month"

##| Choosing the "histogram" function because 1) the variable is numerical, and 2) it can clearly display the frequency as it can automatically count the number of data points per bin.

# Histogram of the average number of reviews per month.
#
# `na.rm = TRUE` belongs to the geom, not to aes(): inside aes() it is an
# unknown aesthetic and does nothing. The explicit x limits are dropped as
# well: range() on a column containing NA returns c(NA, NA), which ggplot2
# treats as "no limits", so they never had an effect.
ggplot(NYC_Airbnb_clean, aes(x = reviews_per_month)) +
  # Listings without reviews have NA here; drop them without a warning.
  geom_histogram(bins = 30, na.rm = TRUE) +
  scale_x_continuous(n.breaks = 10) +
  labs(x = "Reviews Per Month", y = "Count", title = "Reviews Per Month Frequency")

# Analyze the "Last Review Date"

##| Choosing the "freqpoly" (frequency polygon) and "histogram" functions because 1) the variable is a date, which behaves like a quantitative variable, and 2) frequency polygons are also known as ["histograms with lines."](https://dcl-data-vis.stanford.edu/distributions.html) 

# Distribution of the date of each listing's most recent review, shown as a
# histogram with a red frequency polygon on top.
#
# Changes from the earlier version:
#   * `na.rm = TRUE` is passed to the geoms; inside aes() it did nothing.
#   * The histogram is drawn first so the red polygon is not hidden behind
#     the bars.
#   * The explicit x limits are dropped: range() on a column containing NA
#     returns NA limits, which ggplot2 treats as "no limits".
#   * Yearly breaks are requested through `date_breaks`.
ggplot(NYC_Airbnb_clean, aes(x = last_review)) +
  geom_histogram(bins = 30, na.rm = TRUE) +
  geom_freqpoly(bins = 30, color = "red", na.rm = TRUE) +
  scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
  labs(x = "Last Reviews Date", y = "Count", title = " Last Reviews Date Frequency")

4.2 “pathogens” data set

# Analyze the "Cost"

##| Choosing the "histogram" function because 1) the variable is numerical, and 2) it can clearly display the frequency as it can automatically count the number of data points per bin.

# Histogram of the cost per pathogen; x-axis labels are shown as currency in
# billions (values scaled by 1e-9 with a "B" suffix).
cost_labels <- scales::label_currency(suffix = "B", scale = 1e-9)

pathogens_clean %>%
  ggplot(mapping = aes(x = Cost)) +
  geom_histogram() +
  labs(title = "Cost Frequency", y = "Count") +
  scale_x_continuous(n.breaks = 10, labels = cost_labels)

# Analyze the "Cases"

##| Choosing the "histogram" function because 1) the variable is numerical, and 2) it can clearly display the frequency as it can automatically count the number of data points per bin.

# Histogram of the number of cases per pathogen; x-axis labels are shown in
# millions (values scaled by 1e-6 with an "M" suffix).
case_labels <- scales::label_number(suffix = "M", scale = 1e-6)

pathogens_clean %>%
  ggplot(mapping = aes(x = Cases)) +
  geom_histogram() +
  labs(title = "Cases Frequency", y = "Count") +
  scale_x_continuous(n.breaks = 10, labels = case_labels)

5. Bivariate Visualizations

5.1 “NYC_Airbnb” data set

# The relationship between "price" and "Number Of Reviews"

##| Choosing "point" and "smooth" because 1) both variables are numerical and 2) the combination of the two functions makes the trend clear. Using the "facet_wrap" function brings a categorical variable into the analysis. 

# Scatter plot of number of reviews against price with a smoothed trend line,
# coloured by room type and split into one panel per room type (3 rows).
NYC_Airbnb_clean %>%
  ggplot(mapping = aes(x = price, y = number_of_reviews, color = room_type)) +
  geom_point() +
  geom_smooth() +
  facet_wrap(vars(room_type), nrow = 3) +
  labs(
    title = "'Price' and 'Number Of Reviews'",
    x = "Price",
    y = "Number Of Reviews"
  ) +
  scale_color_discrete(name = "Room Type")

# Analyze the "Neighbourhood Group"

##| Choosing "geom_col" because 1) both variables are categorical and 2) including the "position = 'dodge'" argument makes the graphic easier to understand. 

# Count listings for every neighbourhood group / room type pair; the result
# is returned ungrouped.
NYC_Airbnb_counts <- summarise(
  group_by(NYC_Airbnb_clean, neighbourhood_group, room_type),
  n = n(),
  .groups = "drop"
)

# Side-by-side columns: one cluster per neighbourhood group, one bar per
# room type.
NYC_Airbnb_counts %>%
  ggplot(mapping = aes(x = neighbourhood_group, y = n, fill = room_type)) +
  geom_col(position = "dodge") +
  labs(
    title = "Neighbourhood Group Frequency",
    x = "Neighbourhood Group",
    y = "Number"
  ) +
  scale_fill_discrete(name = "Room Type")

# Analyze the relationship between "Room Type" and "Price"

##| Choosing the "jitter" because two variables include a categorical variable and a numerical variable. It can present information regarding the numerical variable based on the categorical variable. Besides, using "point" and "box plot" might make some values in the "price" column overlap, which is not beneficial for understanding the distribution of the numerical value, and the "jitter" function can solve this problem.

# Jittered scatter of price for each room type, coloured by room type.
#
# The y scale no longer sets `limits = range(price)`: geom_jitter() nudges
# points vertically as well as horizontally, so points at the lowest and
# highest prices could land just outside those limits and be dropped.
# Leaving the limits to ggplot2 keeps every point.
ggplot(data = NYC_Airbnb_clean, mapping = aes(x = room_type, y = price)) +
  geom_jitter(aes(color = room_type)) +
  scale_y_continuous(n.breaks = 15) +
  scale_color_discrete(name = "Room Type") +
  labs(x = "Room Type", y = "Price", title = "'Room Type' and 'Price'")

# Analyze the relationship between "Neighbourhood Group" and "Price"

##| Choosing the "jitter" because two variables include a categorical variable and a numerical variable. It can present information regarding the numerical variable based on the categorical variable. Besides, using "point" and "box plot" might make some values in the "price" column overlap, which is not beneficial for understanding the distribution of the numerical value, and the "jitter" function can solve this problem.

# Jittered scatter of price for each neighbourhood group, coloured by group.
#
# The y scale no longer sets `limits = range(price)`: geom_jitter() nudges
# points vertically as well as horizontally, so points at the lowest and
# highest prices could land just outside those limits and be dropped.
# Leaving the limits to ggplot2 keeps every point.
ggplot(data = NYC_Airbnb_clean, mapping = aes(x = neighbourhood_group, y = price)) +
  geom_jitter(aes(color = neighbourhood_group)) +
  scale_y_continuous(n.breaks = 15) +
  scale_color_discrete(name = "Neighbourhood Group") +
  labs(x = "Neighbourhood Group", y = "Price", title = "'Neighbourhood Group' and 'Price'")

# Analyze the relationship between "Room Type" and "Availability"

##| Choosing the "box plot" because two variables include a categorical variable and a numerical variable. The graphic can present various "box plots, " including information regarding the numerical variable based on the categorical variable. 

# One box plot of yearly availability per room type.
NYC_Airbnb_clean %>%
  ggplot(aes(x = room_type, y = availability_365)) +
  labs(
    title = "'Room Type' and 'Availability'",
    x = "Room Type",
    y = "Availability"
  ) +
  geom_boxplot()

# Analyze the relationship between "Neighbourhood Group" and "Availability"

##| Choosing the "box plot" because two variables include a categorical variable and a numerical variable. The graphic can present various "box plots, " including information regarding the numerical variable based on the categorical variable. 

# One box plot of yearly availability per neighbourhood group.
NYC_Airbnb_clean %>%
  ggplot(aes(x = neighbourhood_group, y = availability_365)) +
  labs(
    title = "'Neighbourhood Group' and 'Availability'",
    x = "Neighbourhood Group",
    y = "Availability"
  ) +
  geom_boxplot()

# Analyze the relationship between "Last Review Date" and "Price"

##| Choosing "line" and "point" because the two variables are a date variable and a numerical variable, and using these two functions lets the graphic also carry information about a third variable, which is categorical, through the different colors of the points. 
 
 # Price against the date of the most recent review: a black line through all
 # listings with points coloured by room type on top.
 #
 # `na.rm = TRUE` is passed to the geoms; inside aes() it was an unknown
 # aesthetic and did nothing. The explicit limits are dropped: range() on
 # `last_review` (which contains NA) returns NA limits that ggplot2 ignores,
 # and the price limits matched what ggplot2 derives from the data anyway.
 # Yearly breaks are requested through `date_breaks`.
 ggplot(NYC_Airbnb_clean, aes(x = last_review, y = price)) +
   geom_line(color = "black", na.rm = TRUE) +
   geom_point(aes(color = room_type), na.rm = TRUE) +
   scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
   scale_y_continuous(n.breaks = 15) +
   scale_color_discrete(name = "Room Type") +
   labs(x = "Last Review Date", y = "Price", title = "'Last Review Date' and 'Price'")

5.2 “pathogens” data set

# Analyze "Species" and "Cases"

##| Choosing the "point" function because 1) the two variables include a categorical variable and a numerical variable, and 2) the function can clearly show the location of the numerical variable based on the categorical variable.

# Dot plot of case counts per species abbreviation. Points are coloured by
# abbreviation, x labels are rotated 90 degrees, and the y axis is labelled
# in millions.
case_labels <- scales::label_number(suffix = "M", scale = 1e-6)

pathogens_clean %>%
  ggplot(mapping = aes(x = Species_abbr, y = Cases)) +
  geom_point(mapping = aes(color = Species_abbr)) +
  labs(title = "'Species' and 'Cases'", x = "Species Abbreviation") +
  scale_color_discrete(name = "Species Abbreviation") +
  scale_y_continuous(n.breaks = 10, labels = case_labels) +
  theme(axis.text.x = element_text(angle = 90))

 # Analyze "Species" and "Cost"

##| Choosing the "point" function because 1) the two variables include a categorical variable and a numerical variable, and 2) the function can clearly show the location of the numerical variable based on the categorical variable.

  # Dot plot of cost per species abbreviation. Points are coloured by
  # abbreviation, x labels are rotated 90 degrees, and the y axis is labelled
  # as currency in billions.
  pathogens_clean %>%
    ggplot(mapping = aes(x = Species_abbr, y = Cost)) +
    geom_point(mapping = aes(color = Species_abbr)) +
    labs(title = "'Species' and 'Cost'", x = "Species Abbreviation") +
    scale_color_discrete(name = "Species Abbreviation") +
    scale_y_continuous(
      n.breaks = 10,
      labels = scales::label_currency(suffix = "B", scale = 1e-9)
    ) +
    theme(axis.text.x = element_text(angle = 90))