Week 8: Apply it to your data 7

Import your data

big_tech_stock_prices <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-02-07/big_tech_stock_prices.csv')

## Rows: 45088 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): stock_symbol
## dbl  (6): open, high, low, close, adj_close, volume
## date (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

data_small <- big_tech_stock_prices %>%
    
  select(stock_symbol, date, high, volume) %>%
  filter(stock_symbol %in% c("AAPL", "NVDA", "GOOGL")) 

data_small

## # A tibble: 9,813 × 4
##    stock_symbol date        high    volume
##    <chr>        <date>     <dbl>     <dbl>
##  1 AAPL         2010-01-04  7.66 493729600
##  2 AAPL         2010-01-05  7.70 601904800
##  3 AAPL         2010-01-06  7.69 552160000
##  4 AAPL         2010-01-07  7.57 477131200
##  5 AAPL         2010-01-08  7.57 447610800
##  6 AAPL         2010-01-11  7.61 462229600
##  7 AAPL         2010-01-12  7.49 594459600
##  8 AAPL         2010-01-13  7.53 605892000
##  9 AAPL         2010-01-14  7.52 432894000
## 10 AAPL         2010-01-15  7.56 594067600
## # ℹ 9,803 more rows

Pivoting

long to wide form

data_year <- big_tech_stock_prices %>%
  mutate(year = year(date))

# Pivot the data to make it longer
data_long <- big_tech_stock_prices %>%
  pivot_longer(cols      = c(high, low),          
               names_to  = "price_type",       
               values_to = "price")

annual_summary <- data_year %>%
  filter(year %in% c(2020, 2021, 2022, 2023),                
         stock_symbol %in% c("AAPL", "NVDA", "GOOGL")) %>%
    
  group_by(year, stock_symbol) %>%                   
  summarize(
    max_high = max(high, na.rm = TRUE),
    min_low = min(low, na.rm = TRUE),
    .groups = "drop")

annual_summary

## # A tibble: 9 × 4
##    year stock_symbol max_high min_low
##   <dbl> <chr>           <dbl>   <dbl>
## 1  2020 AAPL            139.     53.2
## 2  2020 GOOGL            92.2    50.4
## 3  2020 NVDA            147.     45.2
## 4  2021 AAPL            182.    116. 
## 5  2021 GOOGL           151.     84.8
## 6  2021 NVDA            346.    116. 
## 7  2022 AAPL            183.    126. 
## 8  2022 GOOGL           152.     83.3
## 9  2022 NVDA            307.    108.

wide to long form

wide_data <- annual_summary %>%
  pivot_wider(
    names_from = stock_symbol, 
    values_from = c(max_high, min_low))

wide_data

## # A tibble: 3 × 7
##    year max_high_AAPL max_high_GOOGL max_high_NVDA min_low_AAPL min_low_GOOGL
##   <dbl>         <dbl>          <dbl>         <dbl>        <dbl>         <dbl>
## 1  2020          139.           92.2          147.         53.2          50.4
## 2  2021          182.          151.           346.        116.           84.8
## 3  2022          183.          152.           307.        126.           83.3
## # ℹ 1 more variable: min_low_NVDA <dbl>

Separating and Uniting

Separate a column

data_sep <- big_tech_stock_prices %>%
  filter(stock_symbol %in% c("AAPL", "NVDA", "GOOGL")) %>%
  separate(col = stock_symbol, into = c("company", "symbol_prefix"), sep = 2, remove = FALSE)

data_sep

## # A tibble: 9,813 × 10
##    stock_symbol company symbol_prefix date        open  high   low close
##    <chr>        <chr>   <chr>         <date>     <dbl> <dbl> <dbl> <dbl>
##  1 AAPL         AA      PL            2010-01-04  7.62  7.66  7.58  7.64
##  2 AAPL         AA      PL            2010-01-05  7.66  7.70  7.62  7.66
##  3 AAPL         AA      PL            2010-01-06  7.66  7.69  7.53  7.53
##  4 AAPL         AA      PL            2010-01-07  7.56  7.57  7.47  7.52
##  5 AAPL         AA      PL            2010-01-08  7.51  7.57  7.47  7.57
##  6 AAPL         AA      PL            2010-01-11  7.6   7.61  7.44  7.50
##  7 AAPL         AA      PL            2010-01-12  7.47  7.49  7.37  7.42
##  8 AAPL         AA      PL            2010-01-13  7.42  7.53  7.29  7.52
##  9 AAPL         AA      PL            2010-01-14  7.50  7.52  7.46  7.48
## 10 AAPL         AA      PL            2010-01-15  7.53  7.56  7.35  7.35
## # ℹ 9,803 more rows
## # ℹ 2 more variables: adj_close <dbl>, volume <dbl>

Unite two columns

data_united <- data_sep %>%
  unite(col = "stock_symbol_reconstructed", company, symbol_prefix, sep = "")

data_united

## # A tibble: 9,813 × 9
##    stock_symbol stock_symbol_reconstructed date        open  high   low close
##    <chr>        <chr>                      <date>     <dbl> <dbl> <dbl> <dbl>
##  1 AAPL         AAPL                       2010-01-04  7.62  7.66  7.58  7.64
##  2 AAPL         AAPL                       2010-01-05  7.66  7.70  7.62  7.66
##  3 AAPL         AAPL                       2010-01-06  7.66  7.69  7.53  7.53
##  4 AAPL         AAPL                       2010-01-07  7.56  7.57  7.47  7.52
##  5 AAPL         AAPL                       2010-01-08  7.51  7.57  7.47  7.57
##  6 AAPL         AAPL                       2010-01-11  7.6   7.61  7.44  7.50
##  7 AAPL         AAPL                       2010-01-12  7.47  7.49  7.37  7.42
##  8 AAPL         AAPL                       2010-01-13  7.42  7.53  7.29  7.52
##  9 AAPL         AAPL                       2010-01-14  7.50  7.52  7.46  7.48
## 10 AAPL         AAPL                       2010-01-15  7.53  7.56  7.35  7.35
## # ℹ 9,803 more rows
## # ℹ 2 more variables: adj_close <dbl>, volume <dbl>

Missing Values

missing_example <- big_tech_stock_prices %>%
  filter(stock_symbol == "AAPL" & year(date) == 2022) %>%
  select(date, high) %>%
  mutate(high = ifelse(day(date) %% 5 == 0, NA, high))  # Add some NAs

missing_example

## # A tibble: 250 × 2
##    date        high
##    <date>     <dbl>
##  1 2022-01-03  183.
##  2 2022-01-04  183.
##  3 2022-01-05   NA 
##  4 2022-01-06  175.
##  5 2022-01-07  174.
##  6 2022-01-10   NA 
##  7 2022-01-11  175.
##  8 2022-01-12  177.
##  9 2022-01-13  177.
## 10 2022-01-14  174.
## # ℹ 240 more rows

# Handle missing values by replacing them with the average of non-missing values
missing_example_filled <- missing_example %>%
  mutate(high = ifelse(is.na(high), mean(high, na.rm = TRUE), high))

missing_example_filled

## # A tibble: 250 × 2
##    date        high
##    <date>     <dbl>
##  1 2022-01-03  183.
##  2 2022-01-04  183.
##  3 2022-01-05  157.
##  4 2022-01-06  175.
##  5 2022-01-07  174.
##  6 2022-01-10  157.
##  7 2022-01-11  175.
##  8 2022-01-12  177.
##  9 2022-01-13  177.
## 10 2022-01-14  174.
## # ℹ 240 more rows

Week 8: Apply it to your data 7

Ronja Dahlin

2024-10-15

Import your data

Pivoting

long to wide form

wide to long form

Separating and Uniting

Separate a column

Unite two columns

Missing Values