Import your data
big_tech_stock_prices <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-02-07/big_tech_stock_prices.csv')
## Rows: 45088 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): stock_symbol
## dbl (6): open, high, low, close, adj_close, volume
## date (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data_small <- big_tech_stock_prices %>%
select(stock_symbol, date, high, volume) %>%
filter(stock_symbol %in% c("AAPL", "NVDA", "GOOGL"))
data_small
## # A tibble: 9,813 × 4
## stock_symbol date high volume
## <chr> <date> <dbl> <dbl>
## 1 AAPL 2010-01-04 7.66 493729600
## 2 AAPL 2010-01-05 7.70 601904800
## 3 AAPL 2010-01-06 7.69 552160000
## 4 AAPL 2010-01-07 7.57 477131200
## 5 AAPL 2010-01-08 7.57 447610800
## 6 AAPL 2010-01-11 7.61 462229600
## 7 AAPL 2010-01-12 7.49 594459600
## 8 AAPL 2010-01-13 7.53 605892000
## 9 AAPL 2010-01-14 7.52 432894000
## 10 AAPL 2010-01-15 7.56 594067600
## # ℹ 9,803 more rows
Separating and Uniting
Separate a column
data_sep <- big_tech_stock_prices %>%
filter(stock_symbol %in% c("AAPL", "NVDA", "GOOGL")) %>%
separate(col = stock_symbol, into = c("company", "symbol_prefix"), sep = 2, remove = FALSE)
data_sep
## # A tibble: 9,813 × 10
## stock_symbol company symbol_prefix date open high low close
## <chr> <chr> <chr> <date> <dbl> <dbl> <dbl> <dbl>
## 1 AAPL AA PL 2010-01-04 7.62 7.66 7.58 7.64
## 2 AAPL AA PL 2010-01-05 7.66 7.70 7.62 7.66
## 3 AAPL AA PL 2010-01-06 7.66 7.69 7.53 7.53
## 4 AAPL AA PL 2010-01-07 7.56 7.57 7.47 7.52
## 5 AAPL AA PL 2010-01-08 7.51 7.57 7.47 7.57
## 6 AAPL AA PL 2010-01-11 7.6 7.61 7.44 7.50
## 7 AAPL AA PL 2010-01-12 7.47 7.49 7.37 7.42
## 8 AAPL AA PL 2010-01-13 7.42 7.53 7.29 7.52
## 9 AAPL AA PL 2010-01-14 7.50 7.52 7.46 7.48
## 10 AAPL AA PL 2010-01-15 7.53 7.56 7.35 7.35
## # ℹ 9,803 more rows
## # ℹ 2 more variables: adj_close <dbl>, volume <dbl>
Unite two columns
data_united <- data_sep %>%
unite(col = "stock_symbol_reconstructed", company, symbol_prefix, sep = "")
data_united
## # A tibble: 9,813 × 9
## stock_symbol stock_symbol_reconstructed date open high low close
## <chr> <chr> <date> <dbl> <dbl> <dbl> <dbl>
## 1 AAPL AAPL 2010-01-04 7.62 7.66 7.58 7.64
## 2 AAPL AAPL 2010-01-05 7.66 7.70 7.62 7.66
## 3 AAPL AAPL 2010-01-06 7.66 7.69 7.53 7.53
## 4 AAPL AAPL 2010-01-07 7.56 7.57 7.47 7.52
## 5 AAPL AAPL 2010-01-08 7.51 7.57 7.47 7.57
## 6 AAPL AAPL 2010-01-11 7.6 7.61 7.44 7.50
## 7 AAPL AAPL 2010-01-12 7.47 7.49 7.37 7.42
## 8 AAPL AAPL 2010-01-13 7.42 7.53 7.29 7.52
## 9 AAPL AAPL 2010-01-14 7.50 7.52 7.46 7.48
## 10 AAPL AAPL 2010-01-15 7.53 7.56 7.35 7.35
## # ℹ 9,803 more rows
## # ℹ 2 more variables: adj_close <dbl>, volume <dbl>
Missing Values
missing_example <- big_tech_stock_prices %>%
filter(stock_symbol == "AAPL" & year(date) == 2022) %>%
select(date, high) %>%
mutate(high = ifelse(day(date) %% 5 == 0, NA, high)) # Add some NAs
missing_example
## # A tibble: 250 × 2
## date high
## <date> <dbl>
## 1 2022-01-03 183.
## 2 2022-01-04 183.
## 3 2022-01-05 NA
## 4 2022-01-06 175.
## 5 2022-01-07 174.
## 6 2022-01-10 NA
## 7 2022-01-11 175.
## 8 2022-01-12 177.
## 9 2022-01-13 177.
## 10 2022-01-14 174.
## # ℹ 240 more rows
# Handle missing values by replacing them with the average of non-missing values
missing_example_filled <- missing_example %>%
mutate(high = ifelse(is.na(high), mean(high, na.rm = TRUE), high))
missing_example_filled
## # A tibble: 250 × 2
## date high
## <date> <dbl>
## 1 2022-01-03 183.
## 2 2022-01-04 183.
## 3 2022-01-05 157.
## 4 2022-01-06 175.
## 5 2022-01-07 174.
## 6 2022-01-10 157.
## 7 2022-01-11 175.
## 8 2022-01-12 177.
## 9 2022-01-13 177.
## 10 2022-01-14 174.
## # ℹ 240 more rows