Assignment 2

Part 1A: Exponential transformation

# Approximate exp(x) with a truncated Taylor series:
#   1 + x + x^2/2! + ... + x^k/k!
#
# x: numeric value at which to evaluate the series.
# k: number of terms to add after the leading 1 (non-negative whole number).
#
# Prints the approximation and returns it invisibly (the value of print()).
Exp <- function(x, k) {
  # Keep the accumulator local so the result never depends on a global.
  result <- 1
  # seq_len() yields an empty sequence for k = 0, unlike 1:k.
  for (i in seq_len(k)) {
    result <- result + x^i / factorial(i)
  }
  print(result)
}

Test

# Series for x = 3 truncated at k = 2: 1 + 3 + 3^2/2! = 8.5.
Exp(3, 2)

## [1] 8.5

Part 1B: Sample mean and sample standard deviation

# Arithmetic mean of a numeric vector, computed by hand.
#
# x: numeric vector.
# Returns the sum of x divided by its length.
sample_mean <- function(x) {
  total <- sum(x)
  n_obs <- length(x)
  total / n_obs
}
# Sample standard deviation of a numeric vector, computed by hand.
#
# x: numeric vector.
# Returns the square root of the sum of squared deviations from the mean,
# divided by (length(x) - 1).
sample_sd <- function(x) {
  n_obs <- length(x)
  centered <- x - sum(x) / n_obs
  sqrt(sum(centered^2) / (n_obs - 1))
}

Test

# Small example vector used to check both helpers.
x <- c(3,5,7,7)
# Mean of x: (3 + 5 + 7 + 7) / 4 = 5.5.
sample_mean(x)

## [1] 5.5

# Sample standard deviation of x (uses the n - 1 denominator).
sample_sd(x)

## [1] 1.914854

Part 1C: Confidence intervals

# Two-sided t-based confidence interval for the mean of a numeric vector.
#
# x:    numeric vector of observations.
# conf: confidence level, a single number strictly between 0 and 1
#       (default 0.95).
#
# Returns a length-2 numeric vector c(lower_bound, upper_bound).
# The value is returned visibly rather than printed inside the function, so a
# top-level call still displays it while calls nested in other code (for
# example inside summarise()) stay silent.
calculate_CI <- function(x, conf = 0.95) {
  stopifnot(is.numeric(conf), length(conf) == 1, conf > 0, conf < 1)

  n <- length(x)
  mean_value <- sample_mean(x)
  standard_error <- sample_sd(x) / sqrt(n)

  # Upper-tail t quantile with n - 1 degrees of freedom.
  alpha <- 1 - conf
  t_score <- qt(p = alpha / 2, df = n - 1, lower.tail = FALSE)
  margin_error <- t_score * standard_error

  c(mean_value - margin_error, mean_value + margin_error)
}

Test

# Six example observations.
y <- c(9, 8, 8, 9, 1, 5)
# 95% confidence interval for the mean of y.
calculate_CI(y, conf = 0.95)

## [1] 3.370258 9.963075

Did I get it right? Yes!

# Cross-check: the confidence interval for the intercept of an
# intercept-only linear model is the t interval for the mean of y.
y <- c(9, 8, 8, 9, 1, 5)
dat <- data.frame(x = y)
fit <- lm(x ~ 1, data = dat)
confint(fit, level = 0.95)

##                2.5 %   97.5 %
## (Intercept) 3.370258 9.963075

Part 2: Wrangling data

# Download the TidyTuesday files published for 2020-01-07.
tuesdata <- tidytuesdayR::tt_load('2020-01-07')

## --- Compiling #TidyTuesday Information for 2020-01-07 ----

## --- There are 11 files available ---

## --- Starting Download ---

## 
##  Downloading file 1 of 11: `fire_nrt_M6_94771.csv`
##  Downloading file 2 of 11: `IDCJAC0009_009151_1800_Data.csv`
##  Downloading file 3 of 11: `IDCJAC0009_023011_1800_Data.csv`
##  Downloading file 4 of 11: `IDCJAC0009_040383_1800_Data.csv`
##  Downloading file 5 of 11: `IDCJAC0009_040913_1800_Data.csv`
##  Downloading file 6 of 11: `IDCJAC0009_066062_1800_Data.csv`
##  Downloading file 7 of 11: `IDCJAC0009_070351_1800_Data.csv`
##  Downloading file 8 of 11: `IDCJAC0009_086232_1800_Data.csv`
##  Downloading file 9 of 11: `MODIS_C6_Australia_and_New_Zealand_7d.csv`
##  Downloading file 10 of 11: `rainfall.csv`
##  Downloading file 11 of 11: `temperature.csv`

## --- Download complete ---

# Pull the rainfall and temperature tables out of the downloaded bundle.
rainfall <- tuesdata$rainfall
temperature <- tuesdata$temperature

Tasks

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.1.2

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.0      ✔ stringr 1.4.0 
## ✔ readr   2.1.2      ✔ forcats 0.5.1

## Warning: package 'ggplot2' was built under R version 4.1.2

## Warning: package 'tibble' was built under R version 4.1.2

## Warning: package 'tidyr' was built under R version 4.1.2

## Warning: package 'readr' was built under R version 4.1.2

## Warning: package 'dplyr' was built under R version 4.1.2

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

# Clean the rainfall table:
#   - drop rows containing any NA,
#   - build a Date column from year / month / day,
#   - upper-case city names so they match the temperature table,
#   - drop month and day by name (not by position) once `date` exists.
df <- rainfall %>%
  drop_na() %>%
  mutate(
    date = as.Date(paste0(year, "-", month, "-", day)),
    city_name = toupper(city_name)
  ) %>%
  select(-month, -day)

# Keep only city/date pairs present in both tables. The printed result shows
# one row per temp_type for each rainfall record.
df1 <- inner_join(df, temperature, by = c("date", "city_name"))
df1

## # A tibble: 83,964 × 13
##    station…¹ city_…²  year rainf…³ period quality   lat  long stati…⁴ date      
##    <chr>     <chr>   <dbl>   <dbl>  <dbl> <chr>   <dbl> <dbl> <chr>   <date>    
##  1 009151    PERTH    1967     2.8      1 Y       -32.0  116. Subiac… 1967-07-05
##  2 009151    PERTH    1967     2.8      1 Y       -32.0  116. Subiac… 1967-07-05
##  3 009151    PERTH    1967     4.8      1 Y       -32.0  116. Subiac… 1967-07-06
##  4 009151    PERTH    1967     4.8      1 Y       -32.0  116. Subiac… 1967-07-06
##  5 009151    PERTH    1967     5.8      1 Y       -32.0  116. Subiac… 1967-07-07
##  6 009151    PERTH    1967     5.8      1 Y       -32.0  116. Subiac… 1967-07-07
##  7 009151    PERTH    1967    16        1 Y       -32.0  116. Subiac… 1967-07-10
##  8 009151    PERTH    1967    16        1 Y       -32.0  116. Subiac… 1967-07-10
##  9 009151    PERTH    1967     1        1 Y       -32.0  116. Subiac… 1967-07-11
## 10 009151    PERTH    1967     1        1 Y       -32.0  116. Subiac… 1967-07-11
## # … with 83,954 more rows, 3 more variables: temperature <dbl>,
## #   temp_type <chr>, site_name <chr>, and abbreviated variable names
## #   ¹station_code, ²city_name, ³rainfall, ⁴station_name

Part 3: Data visualization

Restructure the data

# Keep 2014 onward, then spread temp_type into one column per type so each
# row carries both temperature readings.
df2 <- df1 %>%
  filter(year >= 2014) %>%
  pivot_wider(names_from = temp_type, values_from = temperature)

To plot

# Daily maximum and minimum temperature since 2014, one panel per city.
ggplot(df2, aes(x = date)) +
  # A constant string inside aes() becomes the legend entry, so the label and
  # colour are tied together by name rather than by alphabetical order.
  geom_line(aes(y = max, color = "Max")) +
  geom_line(aes(y = min, color = "Min")) +
  facet_grid(. ~ city_name) +
  labs(
    title = "Temperatures over time",
    x = "Date",
    y = "Temperature",
    color = " ",
    subtitle = "In general, both the maximum and minimum temperatures have been increasing since 2014",
    caption = "Created by Ying Zhang"
  ) +
  scale_color_manual(values = c(Max = "coral3", Min = "aquamarine2"))

Part 3B

How I chose this and why? First, I wrote a function called rainfall_dist, which takes two inputs: city for the name of the city and y for the year I want to plot. Within the function, I used if-else statements to check whether the input city and year exist in the data. If they do, the function moves on to plot the rainfall data using ggplot2.

# Histogram of log(rainfall) for one city and one year, using the global
# data frame `df1`.
#
# city: city name; matched case-insensitively by upper-casing it.
# y:    year to plot.
#
# Stops with "City name doesn't exist!" when the city is absent from df1, and
# with "Year doesn't exist!" when df1 has no rows for that city in that year.
# Otherwise returns the ggplot object.
rainfall_dist <- function(city, y) {
  city_key <- toupper(city)
  if (!city_key %in% df1$city_name) {
    stop("City name doesn't exist!")
  }

  # Validate the year against this city's rows only, so a year that exists
  # solely for other cities is rejected instead of yielding an empty plot.
  years_for_city <- df1$year[df1$city_name == city_key]
  if (!y %in% years_for_city) {
    stop("Year doesn't exist!")
  }

  # log(0) is -Inf, so days with zero rainfall are dropped by the histogram
  # stat (with a warning).
  df1 %>%
    filter(city_name == city_key & year == as.numeric(y)) %>%
    ggplot(aes(log(rainfall))) +
    geom_histogram()
}
# Example: log-rainfall histogram for Sydney in 2015 (lower-case input is accepted).
rainfall_dist("sydney", 2015)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 396 rows containing non-finite values (stat_bin).

Part 4A

# Per-year, per-city rainfall summary from 2014 onward: mean, standard
# deviation, and the 95% confidence bounds for the mean.
# NOTE(review): the printed df1 shows each rainfall record repeated once per
# temp_type, so observations may be counted twice here - TODO confirm whether
# the rows should be de-duplicated before summarising.
rain_df <- df1 %>%
  filter(year >= 2014) %>%
  select(year, city_name, rainfall) %>%
  group_by(year, city_name) %>%
  summarise(
    rainfall_mean = sample_mean(rainfall),
    # Use sample_sd() here; the mean was previously stored in this column.
    rainfall_sd = sample_sd(rainfall),
    lower_bound = calculate_CI(rainfall)[1],
    upper_bound = calculate_CI(rainfall)[2],
    # Same grouping as the default (result stays grouped by year), stated
    # explicitly so summarise() does not emit its grouping message.
    .groups = "drop_last"
  )

## [1] 2.992674 4.694608
## [1] 1.217762 1.923674
## [1] 3.180624 4.486576
## [1] 5.900879 9.845463
## [1] 2.012387 2.958250
## [1] 4.533109 7.816081
## [1] 1.134468 1.888247
## [1] 3.306707 4.819419
## [1] 6.124078 9.473105
## [1] 2.833977 4.498626
## [1] 1.465417 2.719706
## [1] 1.698006 2.609098
## [1] 3.729219 4.919189
## [1] 6.34664 8.87211
## [1] 2.924211 4.647374
## [1] 1.971184 3.467019
## [1] 1.038765 1.638921
## [1] 4.231374 6.162122
## [1] 6.038234 9.768359
## [1] 2.295725 3.630446
## [1] 1.752538 3.185393
## [1] 0.926316 1.667091
## [1] 3.658153 5.494390
## [1]  6.045517 10.599645
## [1] 2.085155 3.410450
## [1] 1.376062 2.818605
## [1] 0.9609772 2.1701486
## [1] 2.100971 3.347918
## [1] 1.859987 4.266329
## [1] 1.632691 3.522276
## [1] 2.992674 4.694608
## [1] 1.217762 1.923674
## [1] 3.180624 4.486576
## [1] 5.900879 9.845463
## [1] 2.012387 2.958250
## [1] 4.533109 7.816081
## [1] 1.134468 1.888247
## [1] 3.306707 4.819419
## [1] 6.124078 9.473105
## [1] 2.833977 4.498626
## [1] 1.465417 2.719706
## [1] 1.698006 2.609098
## [1] 3.729219 4.919189
## [1] 6.34664 8.87211
## [1] 2.924211 4.647374
## [1] 1.971184 3.467019
## [1] 1.038765 1.638921
## [1] 4.231374 6.162122
## [1] 6.038234 9.768359
## [1] 2.295725 3.630446
## [1] 1.752538 3.185393
## [1] 0.926316 1.667091
## [1] 3.658153 5.494390
## [1]  6.045517 10.599645
## [1] 2.085155 3.410450
## [1] 1.376062 2.818605
## [1] 0.9609772 2.1701486
## [1] 2.100971 3.347918
## [1] 1.859987 4.266329
## [1] 1.632691 3.522276

## `summarise()` has grouped output by 'year'. You can override using the `.groups`
## argument.

# Display the per-year, per-city summary table.
rain_df

## # A tibble: 30 × 6
## # Groups:   year [6]
##     year city_name rainfall_mean rainfall_sd lower_bound upper_bound
##    <dbl> <chr>             <dbl>       <dbl>       <dbl>       <dbl>
##  1  2014 BRISBANE           3.84        3.84        2.99        4.69
##  2  2014 CANBERRA           1.57        1.57        1.22        1.92
##  3  2014 MELBOURNE          3.83        3.83        3.18        4.49
##  4  2014 PERTH              7.87        7.87        5.90        9.85
##  5  2014 SYDNEY             2.49        2.49        2.01        2.96
##  6  2015 BRISBANE           6.17        6.17        4.53        7.82
##  7  2015 CANBERRA           1.51        1.51        1.13        1.89
##  8  2015 MELBOURNE          4.06        4.06        3.31        4.82
##  9  2015 PERTH              7.80        7.80        6.12        9.47
## 10  2015 SYDNEY             3.67        3.67        2.83        4.50
## # … with 20 more rows

Part 4B

# Yearly mean rainfall per city, with error bars spanning the confidence
# bounds stored in rain_df; one panel row per city.
ggplot(rain_df, aes(x = year)) +
  # The constant "Mean" inside aes() becomes the single legend entry.
  geom_point(aes(y = rainfall_mean, color = "Mean")) +
  geom_line(aes(y = rainfall_mean, color = "Mean")) +
  geom_errorbar(aes(ymin = lower_bound, ymax = upper_bound), width = 0.2) +
  facet_grid(city_name ~ .) +
  labs(
    # The error bars are confidence bounds, not standard deviations.
    title = "Rainfall mean and 95% confidence interval",
    x = "Year",
    y = "Rainfall",
    color = " ",
    subtitle = "Perth had the heaviest rainfall among the 5 cities. Rainfall decreased from 2018 in all cities except for Canberra.",
    caption = "Created by Ying Zhang"
  ) +
  scale_color_manual(values = c(Mean = "coral3"))

```

Assignment 2

Ying Zhang

9/27/2022

Part 1A: Exponential transformation

Test

Part 1B: Sample mean and sample standard deviation

Test

Part 1C: Confidence intervals

Test

Did I get it right? Yes!

Part 2: Wrangling data

Tasks

Part 3: Data visualization

Restructure the data

To plot

Part 4A

Part 4B