# excel file
data <- read_excel("myData_charts.xlsx")
data
## # A tibble: 45,090 × 10
## stock_symbol date open high low close adj_close volume
## <chr> <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AAPL 2010-01-04 00:00:00 7.62 7.66 7.58 7.64 6.52 493729600
## 2 AAPL 2010-01-05 00:00:00 7.66 7.70 7.62 7.66 6.53 601904800
## 3 AAPL 2010-01-06 00:00:00 7.66 7.69 7.53 7.53 6.42 552160000
## 4 AAPL 2010-01-07 00:00:00 7.56 7.57 7.47 7.52 6.41 477131200
## 5 AAPL 2010-01-08 00:00:00 7.51 7.57 7.47 7.57 6.45 447610800
## 6 AAPL 2010-01-11 00:00:00 7.6 7.61 7.44 7.50 6.40 462229600
## 7 AAPL 2010-01-12 00:00:00 7.47 7.49 7.37 7.42 6.32 594459600
## 8 AAPL 2010-01-13 00:00:00 7.42 7.53 7.29 7.52 6.41 605892000
## 9 AAPL 2010-01-14 00:00:00 7.50 7.52 7.46 7.48 6.38 432894000
## 10 AAPL 2010-01-15 00:00:00 7.53 7.56 7.35 7.35 6.27 594067600
## # ℹ 45,080 more rows
## # ℹ 2 more variables: Column1 <lgl>, HPR <dbl>
# Selecting stocks and their closing prices
selected_stocks <- c("AAPL", "ADBE", "AMZN", "CRM", "CSCO", "GOOGL", "IBM", "INTC", "META", "MSFT", "NFLX", "NVDA", "ORCL", "TSLA")
primary_keys_check <- data %>%
count(stock_symbol, date, close)%>%
filter(n > 1)
primary_keys_check
## # A tibble: 1 × 4
## stock_symbol date close n
## <chr> <dttm> <dbl> <int>
## 1 <NA> NA NA 2
So, the primary key combination for my data set are the variables stock symbol, date, and close.
Divide it using dplyr::select in a way the two have a common variable, which you could use to join the two.
I chose stock_symbol as common variable since all the other variables are related to this one variable.
data_part1 <- data %>% select(stock_symbol:low, close) %>% head(50)
data_part2 <- data %>% select(stock_symbol, close:HPR) %>% head(50)
data_part1
## # A tibble: 50 × 6
## stock_symbol date open high low close
## <chr> <dttm> <dbl> <dbl> <dbl> <dbl>
## 1 AAPL 2010-01-04 00:00:00 7.62 7.66 7.58 7.64
## 2 AAPL 2010-01-05 00:00:00 7.66 7.70 7.62 7.66
## 3 AAPL 2010-01-06 00:00:00 7.66 7.69 7.53 7.53
## 4 AAPL 2010-01-07 00:00:00 7.56 7.57 7.47 7.52
## 5 AAPL 2010-01-08 00:00:00 7.51 7.57 7.47 7.57
## 6 AAPL 2010-01-11 00:00:00 7.6 7.61 7.44 7.50
## 7 AAPL 2010-01-12 00:00:00 7.47 7.49 7.37 7.42
## 8 AAPL 2010-01-13 00:00:00 7.42 7.53 7.29 7.52
## 9 AAPL 2010-01-14 00:00:00 7.50 7.52 7.46 7.48
## 10 AAPL 2010-01-15 00:00:00 7.53 7.56 7.35 7.35
## # ℹ 40 more rows
data_part2
## # A tibble: 50 × 6
## stock_symbol close adj_close volume Column1 HPR
## <chr> <dbl> <dbl> <dbl> <lgl> <dbl>
## 1 AAPL 7.64 6.52 493729600 NA 0.00272
## 2 AAPL 7.66 6.53 601904800 NA -0.00103
## 3 AAPL 7.53 6.42 552160000 NA -0.0159
## 4 AAPL 7.52 6.41 477131200 NA -0.00553
## 5 AAPL 7.57 6.45 447610800 NA 0.00799
## 6 AAPL 7.50 6.40 462229600 NA -0.0126
## 7 AAPL 7.42 6.32 594459600 NA -0.00703
## 8 AAPL 7.52 6.41 605892000 NA 0.0134
## 9 AAPL 7.48 6.38 432894000 NA -0.00324
## 10 AAPL 7.35 6.27 594067600 NA -0.0237
## # ℹ 40 more rows
Use tidyr::left_join or other joining functions.
data_joined <- left_join(data_part1, data_part2)
## Joining with `by = join_by(stock_symbol, close)`
# Confirm data_joined = data
setdiff(data_joined, data)
## # A tibble: 0 × 10
## # ℹ 10 variables: stock_symbol <chr>, date <dttm>, open <dbl>, high <dbl>,
## # low <dbl>, close <dbl>, adj_close <dbl>, volume <dbl>, Column1 <lgl>,
## # HPR <dbl>
So, the two data sets are the same since there is no row in the output of this code snippet here.