data.table

# Load the data.table package
library(data.table)

# I am constructing a data.table with my own data
my_DT <- data.table(
  my_letters = letters[6:10],   # I chose letters 'f' to 'j'
  my_numbers = 6:10,            # I selected numbers from 6 to 10
  my_logical = (6:10) > 8       # I decided to compare if these numbers are greater than 8
)

# I constructed a data.table with columns: 
# 'my_letters', 'my_numbers', and 'my_logical'

# I will now check the class of each column
sapply(my_DT, class)

##  my_letters  my_numbers  my_logical 
## "character"   "integer"   "logical"

# 'my_letters' is "character"
# 'my_numbers' is "integer"
# 'my_logical' is "logical"

# I want to read in a CSV file into a data.table
# my_file.csv should be replaced with the actual file path
#my_dt <- fread("my_file.csv")
# I use fread because it reads strings as strings, not as factors

# I am converting an existing data.frame to a data.table in-place
my_DF <- data.frame(
  my_letters = letters[6:10],   # Again, using letters 'f' to 'j'
  my_numbers = 6:10,            # Numbers from 6 to 10
  my_logical = (6:10) > 8       # Logical comparison greater than 8
)

setDT(my_DF)
# I am not using <- because setDT modifies the data.frame in-place
sapply(my_DF, class)

##  my_letters  my_numbers  my_logical 
## "character"   "integer"   "logical"

# I checked the class, and it's now a data.table

# I am converting a matrix to a data.table
my_matrix <- matrix(1:100, ncol = 10, nrow = 10)
my_DT_from_matrix <- as.data.table(my_matrix)
# Alternatively, I can also use:
# my_DT_from_matrix <- data.table(my_matrix)

# Special symbols in data.table: My customized version with a Medicare dataset

# Load the data.table package
library(data.table)

# I am creating a new dataset related to Medicare
medicare_data <- data.table(
  state = rep(c("Florida", "Texas", "California"), each = 5),
  beneficiaries = c(100000, 150000, 120000, 130000, 140000, 110000, 160000, 170000, 180000, 190000, 200000, 210000, 220000, 230000, 240000),
  average_cost = c(8000, 8200, 8100, 8050, 7900, 8500, 8600, 8700, 8800, 8900, 9000, 9100, 9200, 9300, 9400)
)

# I want to calculate the mean of all numeric columns grouped by 'state'
medicare_data[ , lapply(.SD, mean), by = state]

##         state beneficiaries average_cost
##        <char>         <num>        <num>
## 1:    Florida        128000         8050
## 2:      Texas        162000         8700
## 3: California        220000         9200

# Here, I calculated the mean of 'beneficiaries' and 'average_cost' grouped by 'state'

# I decided to exclude categorical columns and calculate the mean for numeric variables
selected_cols <- c("beneficiaries", "average_cost")
medicare_data[order(state), lapply(.SD, mean), by = state, .SDcols = selected_cols]

##         state beneficiaries average_cost
##        <char>         <num>        <num>
## 1: California        220000         9200
## 2:    Florida        128000         8050
## 3:      Texas        162000         8700

# I arranged the data by 'state' and calculated the mean of selected columns

# To calculate the overall mean without grouping:
medicare_data[ , lapply(.SD, mean), .SDcols = selected_cols]

##    beneficiaries average_cost
##            <num>        <num>
## 1:        170000         8650

# This is how I calculated the mean for the entire dataset for selected numeric variables

# Using .N to count the number of rows in each group by 'state'
medicare_data[, .(count = .N), by = state]

##         state count
##        <char> <int>
## 1:    Florida     5
## 2:      Texas     5
## 3: California     5

# I counted the number of rows in each 'state'

# Adding and modifying columns in medicare_data

# Adding a new column for the squared value of 'average_cost'
medicare_data[, avg_cost_sq := average_cost^2]
# I added 'avg_cost_sq' as the square of 'average_cost'

# Removing the avg_cost_sq column
medicare_data[, avg_cost_sq := NULL]
# I removed the 'avg_cost_sq' column as it was no longer needed

# Adding multiple new columns
medicare_data[, `:=`(avg_cost_sq = average_cost^2, beneficiaries_log = log(beneficiaries))]
# I created 'avg_cost_sq' as the square of 'average_cost' and 'beneficiaries_log' as the log of 'beneficiaries'

# Subsetting and editing specific rows
medicare_data[1:3, note := "High priority"]
# I added a 'note' column with "High priority" for the first three rows

# Renaming columns using setnames
setnames(medicare_data, old = "beneficiaries", new = "total_beneficiaries")
# I renamed 'beneficiaries' to 'total_beneficiaries'

# Editing levels of a factor column
medicare_data[, state_factor := factor(state)]
setattr(medicare_data$state_factor, "levels", c("FL", "TX", "CA"))
# I changed the levels of 'state_factor' to abbreviations

# Writing code compatible with both data.frame and data.table: My customized Medicare example

# Load the data.table package
library(data.table)

# I am creating a Medicare dataset
medicare_data <- data.table(
  state = rep(c("Florida", "Texas", "California"), each = 5),
  beneficiaries = c(100000, 150000, 120000, 130000, 140000, 110000, 160000, 170000, 180000, 190000, 200000, 210000, 220000, 230000, 240000),
  average_cost = c(8000, 8200, 8100, 8050, 7900, 8500, 8600, 8700, 8800, 8900, 9000, 9100, 9200, 9300, 9400)
)

# I also convert it to a data.frame to show compatibility
medicare_df <- as.data.frame(medicare_data)

# Subsetting rows
medicare_data[1:3, ]

##      state beneficiaries average_cost
##     <char>         <num>        <num>
## 1: Florida        100000         8000
## 2: Florida        150000         8200
## 3: Florida        120000         8100

# I subset the first three rows from the data.table

medicare_df[1:3, ]

##     state beneficiaries average_cost
## 1 Florida        100000         8000
## 2 Florida        150000         8200
## 3 Florida        120000         8100

# I subset the first three rows from the data.frame

# Subsetting columns
medicare_data[, c("beneficiaries", "average_cost"), with = FALSE]

##     beneficiaries average_cost
##             <num>        <num>
##  1:        100000         8000
##  2:        150000         8200
##  3:        120000         8100
##  4:        130000         8050
##  5:        140000         7900
##  6:        110000         8500
##  7:        160000         8600
##  8:        170000         8700
##  9:        180000         8800
## 10:        190000         8900
## 11:        200000         9000
## 12:        210000         9100
## 13:        220000         9200
## 14:        230000         9300
## 15:        240000         9400

# I subset the 'beneficiaries' and 'average_cost' columns from the data.table

medicare_df[, c("beneficiaries", "average_cost")]

##    beneficiaries average_cost
## 1         100000         8000
## 2         150000         8200
## 3         120000         8100
## 4         130000         8050
## 5         140000         7900
## 6         110000         8500
## 7         160000         8600
## 8         170000         8700
## 9         180000         8800
## 10        190000         8900
## 11        200000         9000
## 12        210000         9100
## 13        220000         9200
## 14        230000         9300
## 15        240000         9400

# I subset the 'beneficiaries' and 'average_cost' columns from the data.frame

# Subsetting with variables
mycols <- c("beneficiaries", "average_cost")
medicare_data[, mycols, with = FALSE]

##     beneficiaries average_cost
##             <num>        <num>
##  1:        100000         8000
##  2:        150000         8200
##  3:        120000         8100
##  4:        130000         8050
##  5:        140000         7900
##  6:        110000         8500
##  7:        160000         8600
##  8:        170000         8700
##  9:        180000         8800
## 10:        190000         8900
## 11:        200000         9000
## 12:        210000         9100
## 13:        220000         9200
## 14:        230000         9300
## 15:        240000         9400

# I used a variable 'mycols' to subset columns from the data.table

medicare_df[, mycols]

##    beneficiaries average_cost
## 1         100000         8000
## 2         150000         8200
## 3         120000         8100
## 4         130000         8050
## 5         140000         7900
## 6         110000         8500
## 7         160000         8600
## 8         170000         8700
## 9         180000         8800
## 10        190000         8900
## 11        200000         9000
## 12        210000         9100
## 13        220000         9200
## 14        230000         9300
## 15        240000         9400

# I used the same variable to subset columns from the data.frame

# Subsetting indexed rows (using a condition)
medicare_data[beneficiaries > 150000, ]

##         state beneficiaries average_cost
##        <char>         <num>        <num>
## 1:      Texas        160000         8600
## 2:      Texas        170000         8700
## 3:      Texas        180000         8800
## 4:      Texas        190000         8900
## 5: California        200000         9000
## 6: California        210000         9100
## 7: California        220000         9200
## 8: California        230000         9300
## 9: California        240000         9400

# I subset rows where 'beneficiaries' are greater than 150,000 from the data.table

medicare_df[medicare_df$beneficiaries > 150000, ]

##         state beneficiaries average_cost
## 7       Texas        160000         8600
## 8       Texas        170000         8700
## 9       Texas        180000         8800
## 10      Texas        190000         8900
## 11 California        200000         9000
## 12 California        210000         9100
## 13 California        220000         9200
## 14 California        230000         9300
## 15 California        240000         9400

# I subset rows where 'beneficiaries' are greater than 150,000 from the data.frame

# Getting a single column as a table or vector
single_column_table <- medicare_data[, .(beneficiaries)]
# I extracted 'beneficiaries' as a one-column table from the data.table

single_column_vector <- medicare_df$beneficiaries
# I extracted 'beneficiaries' as a vector from the data.frame

# Getting a single row as a vector
first_row_vector <- unlist(medicare_data[1, ])
# I extracted the first row as a vector from the data.table

first_row_vector_df <- unlist(medicare_df[1, ])
# I extracted the first row as a vector from the data.frame

# Writing code compatible with both data.frame and data.table: My customized version

# Load the data.table package
library(data.table)

# I am creating a matrix, a data.frame, and a data.table
my_matrix <- matrix(rnorm(12), nrow = 4, dimnames = list(letters[1:4], c('A', 'B', 'C')))
my_df <- as.data.frame(my_matrix)
my_dt <- as.data.table(my_matrix)

# I am subsetting the 2nd and 3rd items from the matrix
my_matrix[2:3]

## [1] -2.1377195 -0.4321003

# This is how I return the 2nd and 3rd items, treating 'my_matrix' like a vector

# I am subsetting the 2nd and 3rd columns from the data.frame
my_df[2:3]

##            B          C
## a -0.9944632 -0.9616720
## b -1.9338497 -0.7455910
## c -2.5517490 -0.2899556
## d  0.1268990 -0.6123586

# This is my way of getting the 2nd and 3rd columns from the data.frame

# I am subsetting the 2nd and 3rd rows from the data.table
my_dt[2:3]

##             A         B          C
##         <num>     <num>      <num>
## 1: -2.1377195 -1.933850 -0.7455910
## 2: -0.4321003 -2.551749 -0.2899556

# Here, I focus on retrieving the 2nd and 3rd rows from the data.table

# To be clear about row subsetting, I use a comma
my_matrix[2:3, ]

##            A         B          C
## b -2.1377195 -1.933850 -0.7455910
## c -0.4321003 -2.551749 -0.2899556

# This returns the 2nd and 3rd rows from the matrix explicitly

my_df[2:3, ]

##            A         B          C
## b -2.1377195 -1.933850 -0.7455910
## c -0.4321003 -2.551749 -0.2899556

# For the data.frame, I return the 2nd and 3rd rows using the comma

my_dt[2:3, ]

##             A         B          C
##         <num>     <num>      <num>
## 1: -2.1377195 -1.933850 -0.7455910
## 2: -0.4321003 -2.551749 -0.2899556

# This is how I explicitly get the 2nd and 3rd rows from the data.table

# Now, I want to subset the 2nd and 3rd columns
my_matrix[, 2:3]

##            B          C
## a -0.9944632 -0.9616720
## b -1.9338497 -0.7455910
## c -2.5517490 -0.2899556
## d  0.1268990 -0.6123586

# My approach to getting the 2nd and 3rd columns from the matrix

my_df[, 2:3]

##            B          C
## a -0.9944632 -0.9616720
## b -1.9338497 -0.7455910
## c -2.5517490 -0.2899556
## d  0.1268990 -0.6123586

# For the data.frame, I use this to get the 2nd and 3rd columns

my_dt[, 2:3, with = FALSE]

##             B          C
##         <num>      <num>
## 1: -0.9944632 -0.9616720
## 2: -1.9338497 -0.7455910
## 3: -2.5517490 -0.2899556
## 4:  0.1268990 -0.6123586

# I retrieve the 2nd and 3rd columns from the data.table like this

# Subsetting with column names
my_matrix[, c("B", "C")]

##            B          C
## a -0.9944632 -0.9616720
## b -1.9338497 -0.7455910
## c -2.5517490 -0.2899556
## d  0.1268990 -0.6123586

# Using column names, I get 'B' and 'C' from the matrix

my_df[, c("B", "C")]

##            B          C
## a -0.9944632 -0.9616720
## b -1.9338497 -0.7455910
## c -2.5517490 -0.2899556
## d  0.1268990 -0.6123586

# For the data.frame, I subset columns 'B' and 'C'

my_dt[, c("B", "C"), with = FALSE]

##             B          C
##         <num>      <num>
## 1: -0.9944632 -0.9616720
## 2: -1.9338497 -0.7455910
## 3: -2.5517490 -0.2899556
## 4:  0.1268990 -0.6123586

# In the data.table, I extract columns 'B' and 'C' using their names

# Using a variable to subset columns
my_selected_cols <- 2:3
my_matrix[, my_selected_cols]

##            B          C
## a -0.9944632 -0.9616720
## b -1.9338497 -0.7455910
## c -2.5517490 -0.2899556
## d  0.1268990 -0.6123586

# Here, I use 'my_selected_cols' to get the 2nd and 3rd columns from the matrix

my_df[, my_selected_cols]

##            B          C
## a -0.9944632 -0.9616720
## b -1.9338497 -0.7455910
## c -2.5517490 -0.2899556
## d  0.1268990 -0.6123586

# In the data.frame, I use 'my_selected_cols' for the same purpose

my_dt[, my_selected_cols, with = FALSE]

##             B          C
##         <num>      <num>
## 1: -0.9944632 -0.9616720
## 2: -1.9338497 -0.7455910
## 3: -2.5517490 -0.2899556
## 4:  0.1268990 -0.6123586

# This is how I achieve it in the data.table

# Subsetting rows based on a condition
my_dt[my_dt$A > 0, ]

## Empty data.table (0 rows and 3 cols): A,B,C

# I filter rows in the data.table where 'A' has values greater than 0

my_df[my_df$A > 0, ]

## [1] A B C
## <0 rows> (or 0-length row.names)

# Similarly, I filter rows in the data.frame where 'A' is greater than 0

# Selecting a single column as a table or vector
single_col_table_dt <- my_dt[, .(B)]
# I create a one-column table from the data.table for column 'B'

single_col_vector_df <- my_df$B
# From the data.frame, I extract 'B' as a vector

# Getting a single row as a vector
first_row_vector_dt <- unlist(my_dt[1, ])
# I convert the first row of the data.table into a vector

first_row_vector_df <- unlist(my_df[1, ])
# Similarly, I convert the first row of the data.frame into a vector

# Setting keys and indices in data.table: My personalized example

# Load the data.table package
library(data.table)

# I am creating a data.table with new data
my_DT <- data.table(
  product = letters[6:10],
  price = c(200, 150, 300, 100, 250),
  in_stock = c(TRUE, TRUE, FALSE, TRUE, FALSE)
)

# Let's view my data.table
# my_DT
# This shows the initial data.table structure with 'product', 'price', and 'in_stock'

# Setting the key for efficient operations based on 'price'
setkey(my_DT, price)

# I can check the key using the tables() function
tables()

##                   NAME NROW NCOL MB                             COLS    KEY
## 1:       medicare_data   15    3  0 state,beneficiaries,average_cost [NULL]
## 2:               my_DF    5    3  0 my_letters,my_numbers,my_logical [NULL]
## 3:               my_dt    4    3  0                            A,B,C [NULL]
## 4:               my_DT    5    3  0           product,price,in_stock  price
## 5:   my_DT_from_matrix   10   10  0            V1,V2,V3,V4,V5,V6,... [NULL]
## 6: single_col_table_dt    4    1  0                                B [NULL]
## 7: single_column_table   15    1  0                    beneficiaries [NULL]
## Total: 0MB using type_size

# This displays the metadata, showing 'price' is set as the key

# Viewing my data.table after setting the key
# my_DT
# The data.table is now sorted by 'price' because 'price' is the key

# Now, I'll use the new "on=" feature for fast indexing without relying on keys
# Setting a secondary index on 'product'
setindex(my_DT, product)

# I check which indices have been set
indices(my_DT)

## [1] "product"

# This confirms that 'product' has been set as a secondary index

data.table

Avery Holloman

2024-11-05