```{r setup, include=FALSE}
# Global chunk defaults: echo the code, cache chunk results, and turn off
# lazy loading of cached objects.
knitr::opts_chunk$set(
  echo = TRUE,
  cache = TRUE,
  cache.lazy = FALSE
)
```

```{r, message = FALSE, warning = FALSE}
# Packages used throughout the document. The attach order is kept as in the
# original because it determines which package masks which function.
library(naniar)
library(broom)
library(ggmap)
library(knitr)
library(lubridate)
library(rwalkr)
library(sugrrants)
library(timeDate)
library(tsibble)
library(here)
library(readr)
library(tidyverse)
library(ggResidpanel)
library(gridExtra)
library(kableExtra)
```

```{r readingdata, message = FALSE, warning = FALSE, eval = FALSE}
# NOTE(review): `Trees_with_species_and_dimensions_Urban_Forest_` is not
# created anywhere in this document; presumably it was imported interactively.
# TODO: replace with an explicit read_csv() call on the raw data file.
tree_data0 <- Trees_with_species_and_dimensions_Urban_Forest_
```




# Question 1: Rename the variables *Date Planted* and *Year Planted* to *Dateplanted* and *Yearplanted* using the *rename()* function. Make sure *Dateplanted* is defined as a **date variable**. Then extract from the variable *Dateplanted* the year and store it in a new variable called  *Year*. (6pts)

```{r, eval = FALSE}
# Rename the two planting columns, parse `Dateplanted` as a Date
# (day-month-year order), and derive the planting year from it.
tree_data <- tree_data0 %>%
  rename(
    Dateplanted = `Date Planted`,
    Yearplanted = `Year Planted`
  ) %>%
  mutate(
    Dateplanted = dmy(Dateplanted),
    Year = year(Dateplanted)
  )
```

Question 2: Investigate graphically the missing values in the variable Dateplanted. What do you observe? (max 30 words) (4pts)

```{r, eval = FALSE}
# Visualise missingness across every column of the tree data.
vis_miss(tree_data, warn_large_data = FALSE)
# I can not see anything
```

Question 3: What is the proportion of missing values in each variable in the tree data set? Display the results arranged from larger to lower proportion. (2pts)

```{r, eval = FALSE}
# Proportion of missing values per variable, largest first.
miss_var_summary(tree_data) %>%
  arrange(desc(pct_miss))
```

# Question 4: Remove all the rows in the data set for which the variable *Dateplanted* has a missing value recorded. Use R inline code to complete the sentence below. (4pts)

```{r, eval = FALSE}
# Keep only the trees that have a recorded planting date.
tree_data_clean0 <- tree_data %>%
  filter(!is.na(Dateplanted))
```

The number of rows in the cleaned data set are --- and the number of columns are ----

# Question 5: There are a number of trees for which the *Dateplanted* was not stored correctly in the data set. Find those trees and amend the *Year* variable accordingly. (Hint: The data set contains information about the year when the trees were planted; you might use that variable to locate those trees for which *Dateplanted* was not recorded properly and amend the *Year* variable.) Replacing the variable *Year* by *Year Planted* is not a valid solution; you need to find the entries that are incorrectly coded in the variable *Year* and replace those with the correct year information. Once you have amended the variable *Year*, demonstrate that both *Year* and *Yearplanted* are the same. Please use this cleaned data set for the rest of the questions in the assignment. (4pts)

```{r, eval = FALSE}
# Where the year parsed from `Dateplanted` disagrees with `Yearplanted`,
# take the recorded `Yearplanted` instead. The result is stored back into the
# data set; the original computed the vector and then discarded it.
tree_data_clean0 <- tree_data_clean0 %>%
  mutate(Year = ifelse(Year == Yearplanted, Year, Yearplanted))
```

Question 6: Create a map with the tree locations in the data set (4pts)

```{r, message = FALSE, warning = FALSE, eval = FALSE}
# We have created the map below for you
melb_map <- read_rds("Data/melb-map.rds")

# Here you just need to add the location for each tree into the map.
ggmap(melb_map) +
  geom_point(
    data = tree_data_clean0,
    mapping = aes(x = Longitude, y = Latitude),
    colour = "#006400",
    alpha = 0.5,
    size = 0.2
  )
```



# Question 7: Repeat the map and draw trees in the *Genus* group of Eucalyptus, Macadamia, Prunus, Acacia, Quercus. Use the "Dark2" color palette and display the legend at the bottom of the plot. (6pts)

```{r, eval = FALSE}
# Trees belonging to the five genera of interest. Uses the cleaned data set,
# as Question 5 requires for all later questions. No grouping is applied
# because the result is only plotted, never summarised.
selected_group <- tree_data_clean0 %>%
  filter(
    Genus %in% c("Eucalyptus", "Macadamia", "Prunus", "Acacia", "Quercus")
  )
```

```{r, message = FALSE, warning = FALSE, eval = FALSE}
# Map the selected genera, one colour per genus (Dark2 palette), with the
# legend placed below the plot.
ggmap(melb_map) +
  geom_point(
    data = selected_group,
    mapping = aes(x = Longitude, y = Latitude, colour = Genus),
    alpha = 0.5,
    size = 0.2
  ) +
  scale_color_brewer(palette = "Dark2") +
  theme(legend.position = "bottom")
```


# Question 8: How many trees are in Melbourne according to this data set?  (1pt)

The number of trees described in the data set is `----`

# Question 9: Filter the data *tree_data_clean* so that you display only the variables "Year", "Located in", "Common Name", arrange the data set by *Year* in descending order and display the first 4 lines. Call this new data set *tree_data_clean_filter*. Then answer the following question using inline R code: When (Year), where and the common name of the tree that was first tree planted in Melbourne according to this data set? (8pts)

```{r, eval = FALSE}
# Keep the three columns of interest and sort by planting year.
# NOTE(review): the sort is ascending so the earliest plantings come first;
# the question text asks for descending order - confirm which is intended.
# `eval = FALSE` matches the upstream chunks: `tree_data_clean0` is created in
# a chunk that is itself not evaluated.
tree_data_clean_filter <- tree_data_clean0 %>%
  select(Year, `Located in`, `Common Name`) %>%
  arrange(Year)

head(tree_data_clean_filter, 4)
```

The first tree was planted in 1997 in a park, and its common name is Spotted Gum.

# Question 10: How many trees are planted in parks and streets? Display only the results for parks and streets using the function kable() from the kableExtra R package to produce a table. Hint: Use backticks (`` ` ``) to call variables that have a space in the name. (4pts)

```{r, eval = FALSE}
# Number of trees per `Located in` category, rendered as a styled table.
tree_data_clean0 %>%
  select(`Located in`) %>%
  group_by(`Located in`) %>%
  summarise(n()) %>%
  kable(caption = "Trees planted") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

# NOTE(review): this looks like a leftover kable() example that is unrelated
# to the tree data - confirm whether it should be removed.
kable(head(mtcars[, 1:4]), "simple")
```

Question 11: How many trees are in each of the Family groups in the data set? (1pt)

```{r, eval = FALSE}
# Number of trees in each Family. Rows where Family is missing are dropped:
# the comparison with "NA" evaluates to NA for them, and filter() discards
# NA conditions.
tree_data_clean0 %>%
  filter(Family != "NA") %>%
  select(Family) %>%
  group_by(Family) %>%
  summarise(n())
```

Question 12: Create a new variable called Index in tree_data_clean that has value 1 for each row of the data set. (2pts)

```{r, eval = FALSE}
# Add a constant column `Index` equal to 1 on every row. The original piped
# the data frame into seq(), which does not add a column.
tree_data_clean0 <- tree_data_clean0 %>%
  mutate(Index = 1)
```

# Question 13: Create a markdown table (hint: use kable() from the knitr R package) displaying the number of trees planted in 1899, 1900, 1995, 2000, 2019, 2020 with common names Ironbark, Olive, Plum, Oak, and Elm. What is the oldest tree in this group? (6pts)

```{r, eval = FALSE}
library(kableExtra)

# Count trees by year and common name for the requested years and names.
# `%in%` tests set membership; `==` against a vector recycles it and
# silently compares row i with element i.
# NOTE(review): this matches common names exactly; if the data holds longer
# names (e.g. a species prefix), a pattern match may be needed - confirm.
tree_data_clean0 %>%
  filter(
    Year %in% c(1899, 1900, 1995, 2000, 2019, 2020),
    `Common Name` %in% c("Ironbark", "Olive", "Plum", "Oak", "Elm")
  ) %>%
  select(Year, `Common Name`) %>%
  group_by(Year, `Common Name`) %>%
  summarise(n()) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
```

# Question 14: Select the trees with diameters (Diameter Breast Height) greater than 80 and smaller 100 cm and comment where the trees are located (Streets or Parks, max 25 words) (6pts)

```{r, eval = FALSE}
# Count, per location type, the trees whose diameter at breast height is
# strictly between 80 and 100. The two conditions are separate filter()
# arguments (combined with AND); wrapping them in c() would concatenate them
# into one logical vector twice as long as the data.
large_trees_data <- tree_data_clean0 %>%
  filter(
    `Diameter Breast Height` > 80,
    `Diameter Breast Height` < 100
  ) %>%
  group_by(`Located in`) %>%
  summarise(n())
```

Question 15: Plot the trees of the diameter that you selected in question 14 which are located in parks and streets in a map and choose different color for the trees in streets and trees in parks. (12pts)

```{r, eval = FALSE}
# Trees whose diameter at breast height is strictly between 80 and 100,
# kept at row level so they can be plotted on the map.
large_trees_data_parks <- tree_data_clean0 %>%
  filter(
    `Diameter Breast Height` > 80,
    `Diameter Breast Height` < 100
  )
```
Large trees seem to be concentrated in certain streets.

```{r, message = FALSE, warning = FALSE, eval = FALSE}
# Map the large trees, coloured by where they are located.
ggmap(melb_map) +
  geom_point(
    data = large_trees_data_parks,
    mapping = aes(x = Longitude, y = Latitude, colour = `Located in`),
    alpha = 0.9,
    size = 0.2
  )
```

# Question 16: Do you see any pattern in the locations of the trees in the map that you produced in Question 15? Comment on the results (max 30 words) (2pts)

No, I cannot see any pattern in the map; there may be a mistake in my code.

Question 17: Create a boxplot displaying the number of trees by Family planted per year from 1999? What can you conclude from the plot? (6pts)

```{r, eval = FALSE}
# Trees planted per Family per Year from 1999 onwards, drawn as one boxplot
# per Family. Column names are case-sensitive (`Family`, `Year`), and ggplot
# layers must be joined with `+`.
tree_data_clean0 %>%
  filter(Year >= 1999) %>%
  count(Family, Year) %>%
  ggplot(aes(x = Family, y = n, group = Family)) +
  geom_boxplot()
```

Question 18: Create a time series plot (using geom_line) that display the total number of trees planted per year starting from 2006. (6pts)

```{r, eval = FALSE}
# Total number of trees planted in each year from 2006 onwards.
tree_data_clean_2006 <- tree_data_clean0 %>%
  filter(Year >= 2006) %>%
  group_by(Year) %>%
  summarise(n = n())
```

Question 19: Fit a linear regression model to understand the relationship between the number of trees planted (log transformed) over time (years) using the data you created in Question 18 . Display the model summary results and comment on the results (max 30 words). (2pts)

{r, eval = FALSE} IDK

Question 20: Create a figure to display the diagnostics plots of the linear model that you fit in Question 19. Comment on the diagnostic plots (max 30 words). Is this a good/bad model and why? (Max 30 words) (6pts)

```{r, eval = FALSE}
# Line plot of the yearly planting totals. It uses `tree_data_clean_2006`,
# the per-year summary that holds the `n` column; the original referred to a
# misspelled, non-existent object and ended on a dangling `+`.
# NOTE(review): this is the time series plot asked for in Question 18, not
# the model diagnostics asked for in Question 20.
ggplot(tree_data_clean_2006, aes(Year, n)) +
  geom_line() +
  ylab("number of trees") +
  ggtitle("number of trees which is planted per year in 2006")
```

Question 21: Report R2, Radjusted, AIC and BIC. Is this a good/bad model? Please explain your answer.(Max 30 words) (2pts)

{r, eval = FALSE} -#May be it is bad model, but ,sorry i can not run it

Question 22: Run the code below to read the pedestrian counts to select data from 4 sensors. Fill in the gaps so that the “selected” data frame contains the selected_sensors and the “nonselected” data frame contains the other sensors. (2pts)

```{r, eval = FALSE}
# Sensors whose status is "A".
ped_loc <- pull_sensor() %>%
  filter(status == "A")

selected_sensors <- c(
  "Victoria Point",
  "Melbourne Central",
  "Flinders Street Station Underpass",
  "Lonsdale St-Spring St (West)"
)

# Identify those sensors that are selected, and those that are not. The
# `sensor` column is the one the later sensor map uses for colouring.
selected <- ped_loc %>%
  filter(sensor %in% selected_sensors)
nonselected <- ped_loc %>%
  filter(!sensor %in% selected_sensors)
```


  
  
# Question 23: Add the sensors locations (both selected and nonselected) into the map below. The *selected sensors* should be display with a diamond shape (Hint: shape = 18) (6pts)
  
```{r, eval = FALSE}
# Load the pre-built Melbourne base map.
# NOTE(review): an earlier chunk reads the same map from "Data/melb-map.rds";
# confirm which of the two paths is the right one.
melb_map <- read_rds(here::here("assignment-2/data-raw/melb-map.rds"))
```

```{r, eval = FALSE}
# Non-selected sensors as blue dots; selected sensors as larger diamonds
# (shape 18), coloured by sensor name, with the legend split over two rows.
ggmap(melb_map) +
  geom_point(
    data = nonselected,
    mapping = aes(x = longitude, y = latitude),
    colour = "#2b8cbe",
    alpha = 0.6,
    size = 2
  ) +
  geom_point(
    data = selected,
    mapping = aes(x = longitude, y = latitude, colour = sensor),
    size = 3,
    shape = 18
  ) +
  labs(x = "longitude", y = "latitude") +
  scale_color_brewer(palette = "Spectral", name = "sensor") +
  guides(col = guide_legend(nrow = 2))
```

Question 24: Now extract data for the year 2019 (Jan 1st - Dec 31st) and 2020 up to August 1st using the `rwalkr` package (Hint: You should use the melb_walk() function). Please set the R code chunk option to cache = TRUE so it does not run every time you knit (it takes several minutes to extract the data for the first time) (2pts)

```{r, eval = FALSE}
library(rwalkr)

# Pedestrian counts from 1 January 2019 to 1 August 2020.
walk_data <- melb_walk(
  from = as.Date("2019-01-01"),
  to = as.Date("2020-08-01")
)
```

Question 25: Filter the data down to include only the four sensors we looked at in Question 22 and create three new variables (extracting the information from the Date variable) that display the day, month and year. Display the first 3 rows

```{r, eval = FALSE}
# Keep the four selected sensors and split `Date` into day, month and year.
walk_data_subset <- walk_data %>%
  filter(Sensor %in% selected_sensors) %>%
  mutate(
    Day = day(Date),
    Month = month(Date),
    Year = year(Date)
  )
```



# Question 26:  Create a **function** that takes pedestrian counts data as input and produces boxplots for each `year` while faceting over `Sensor` (use facet_wrap). Use this function to create boxplots for year 2019 and 2020 for all 4 sensors selected in *walk_data_subset*. What do you observe in this figure?. Does this figure make comparison between sensors across years easy, explain your answer? (max 30 words). (8pts)

```{r, eval = FALSE}
#' Boxplots of pedestrian counts per year, faceted by sensor
#'
#' @param data A data frame with columns `Year`, `Count` and `Sensor`.
#' @return A ggplot object with one horizontal boxplot per year in each
#'   sensor facet.
plot_function <- function(data) {
  data %>%
    # Treat Year as a discrete label so each year gets its own box.
    mutate(Year = as.character(Year)) %>%
    ggplot(aes(x = Year, y = Count)) +
    coord_flip() +
    geom_boxplot() +
    facet_wrap(~Sensor)
}
```

# Question 27: Create a function that takes 5 input variables: year, start date, end date, a pedestrian count dataframe and sensor name. This function will produce a line plot displaying the total number of pedestrian counts (total pedestrian count) for each day for the chosen (input) sensor. The line plot should have the dates on the x-axis and, on the y-axis, the total number of pedestrian counts (total pedestrian count) for the input range (start and end date). Then, using this function, examine the counts for the week 3 May 2020 - 10 May 2020 for the sensor “Lonsdale St-Spring St (West)”, which is the closest to the Carlton park (an area with lots of trees, as you can see in the map that you created above!). (16pts)

```{r, eval = FALSE}
```



# Question 28: What can you conclude from  the plot that you created in Question 27 (max 30 words)? (2pts)
Sorry, I don't know how to do this; I think it is too hard for me.
  
# Question 29: Read in the weather data and create flagging variables to the data set (10 pts). Below you will find more information about the weather data:

One question we want to answer is: "Does the weather make a difference to the number of people walking out?"

Time of day and day of week are the predominant driving force of the number of pedestrian, depicted in the previous data plots. Apart from these temporal factors, the weather condition could possibly affect how many people are walking in the city. In particular, people are likely to stay indoors, when the day is too hot or too cold, or raining hard. 

Daily meteorological data as a separate source, available on [National Climatic Data Center](https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/), is used and joined to the main pedestrian data table using common dates. 

Binary variables are created to serve as the tipping points

We have pulled information on weather stations for the Melbourne area - can you combine it together into one dataset?
  
- high_prcp if `prcp` > 5 (if yes, "rain", if no, "none")
- high_temp if `tmax` > 25 (if yes, "hot", if no, "not")
- low_temp if `tmin` < 6 (if yes, "cold", if no, "not")




```{r, eval = FALSE}
# Read the 2020 weather data and add the three flag variables.
# The thresholds are strict, as specified in the question text:
#   high_prcp: prcp > 5  -> "rain", otherwise "none"
#   high_temp: tmax > 25 -> "hot",  otherwise "not"
#   low_temp:  tmin < 6  -> "cold", otherwise "not"
# NOTE(review): the original had two competing read_csv() lines
# ("assignment-2/data/..." and "Data/..."); the second is kept - confirm the
# correct path.
melb_weather_2020 <- read_csv("Data/melb_ncdc_2020.csv") %>%
  mutate(
    high_prcp = if_else(
      condition = prcp > 5,
      true = "rain",
      false = "none"
    ),
    high_temp = if_else(
      condition = tmax > 25,
      true = "hot",
      false = "not"
    ),
    low_temp = if_else(
      condition = tmin < 6,
      true = "cold",
      false = "not"
    )
  )
```



# Question 30: Select the pedestrian count data  only for the dates recorded in the weather data and combine those two datasets so that the resulting combined data set contains the variables from both datasets (Hint: Look at the materials for week 4). Display the first 3 rows of the new data set and the report the dimension of the new data set. (6pts)

```{r, eval = FALSE}
# Pedestrian counts restricted to the dates present in the weather data.
walk_data_subset_2020 <- walk_data_subset %>%
  filter(Date %in% melb_weather_2020$date)

# Attach the pedestrian counts to the weather rows, matching on date.
combined_data <- melb_weather_2020 %>%
  left_join(
    walk_data_subset_2020,
    by = c("date" = "Date")
  )

# First three rows and the dimensions of the combined data set.
head(combined_data, 3)
dim(combined_data)
```

Question 31: Explain why the number of rows you reported in Question 30 is correct. (Max 30 words)

Question 32: Calculate the mean “max temperature” and mean of the “min temperature” for each month for each sensor. Report the first 6 rows. Why are there repeated results? (No more than 25 words) (4pts)

```{r, eval = FALSE}
# Mean daily maximum and minimum temperature per month and sensor
# (missing values ignored); show the first six rows.
combined_data %>%
  group_by(Month, Sensor) %>%
  summarise(
    mean_tmax = mean(tmax, na.rm = TRUE),
    mean_tmin = mean(tmin, na.rm = TRUE)
  ) %>%
  head(6)
```

Question 33: Explore the sensor counts against the weather flagging variables using boxplots as follows. Create 3 figures that look at the distribution of the daily totals for each of the sensors, according to the weather flagging variables (`high_prcp`, `high_temp`, and `low_temp`). Make sure the legend on the x-axis is fully visible. Answer the following question: Does the weather make a difference to the number of people walking out? (Max 40 words) (9pts)

{r, eval = FALSE} ggplot(combined_data, --------

{r, eval = FALSE} ggplot(combined_data, ------- {r, eval = FALSE} ggplot(combined_data, ------

Extra for ETC5510: Using the tree data set, the weather data and the pedestrian count data set select two research questions that you would like to investigate (you will need to use at least two of the aforementioned data sets). Write the question and the produce the analyses and visualizations required to answer your questions. (30 points for each question)

Assignment 2, Semester 2, 2020

ETC5510

linlingbai

2020/10/21