# Load the data.table package
library(data.table)
# I am constructing a data.table with my own data
my_DT <- data.table(
my_letters = letters[6:10], # I chose letters 'f' to 'j'
my_numbers = 6:10, # I selected numbers from 6 to 10
my_logical = (6:10) > 8 # I decided to compare if these numbers are greater than 8
)
# I constructed a data.table with columns:
# 'my_letters', 'my_numbers', and 'my_logical'
# I will now check the class of each column
sapply(my_DT, class)
## my_letters my_numbers my_logical
## "character" "integer" "logical"
# 'my_letters' is "character"
# 'my_numbers' is "integer"
# 'my_logical' is "logical"
# I want to read in a CSV file into a data.table
# my_file.csv should be replaced with the actual file path
#my_dt <- fread("my_file.csv")
# I use fread because it reads strings as strings, not as factors
# I am converting an existing data.frame to a data.table in-place
my_DF <- data.frame(
my_letters = letters[6:10], # Again, using letters 'f' to 'j'
my_numbers = 6:10, # Numbers from 6 to 10
my_logical = (6:10) > 8 # Logical comparison greater than 8
)
setDT(my_DF)
# I am not using <- because setDT modifies the data.frame in-place
sapply(my_DF, class)
## my_letters my_numbers my_logical
## "character" "integer" "logical"
# I checked the class, and it's now a data.table
# I am converting a matrix to a data.table
my_matrix <- matrix(1:100, ncol = 10, nrow = 10)
my_DT_from_matrix <- as.data.table(my_matrix)
# Alternatively, I can also use:
# my_DT_from_matrix <- data.table(my_matrix)
# Special symbols in data.table: My customized version with a Medicare dataset
# Load the data.table package
library(data.table)
# I am creating a new dataset related to Medicare
medicare_data <- data.table(
state = rep(c("Florida", "Texas", "California"), each = 5),
beneficiaries = c(100000, 150000, 120000, 130000, 140000, 110000, 160000, 170000, 180000, 190000, 200000, 210000, 220000, 230000, 240000),
average_cost = c(8000, 8200, 8100, 8050, 7900, 8500, 8600, 8700, 8800, 8900, 9000, 9100, 9200, 9300, 9400)
)
# I want to calculate the mean of all numeric columns grouped by 'state'
medicare_data[ , lapply(.SD, mean), by = state]
## state beneficiaries average_cost
## <char> <num> <num>
## 1: Florida 128000 8050
## 2: Texas 162000 8700
## 3: California 220000 9200
# Here, I calculated the mean of 'beneficiaries' and 'average_cost' grouped by 'state'
# I decided to exclude categorical columns and calculate the mean for numeric variables
selected_cols <- c("beneficiaries", "average_cost")
medicare_data[order(state), lapply(.SD, mean), by = state, .SDcols = selected_cols]
## state beneficiaries average_cost
## <char> <num> <num>
## 1: California 220000 9200
## 2: Florida 128000 8050
## 3: Texas 162000 8700
# I arranged the data by 'state' and calculated the mean of selected columns
# To calculate the overall mean without grouping:
medicare_data[ , lapply(.SD, mean), .SDcols = selected_cols]
## beneficiaries average_cost
## <num> <num>
## 1: 170000 8650
# This is how I calculated the mean for the entire dataset for selected numeric variables
# Using .N to count the number of rows in each group by 'state'
medicare_data[, .(count = .N), by = state]
## state count
## <char> <int>
## 1: Florida 5
## 2: Texas 5
## 3: California 5
# I counted the number of rows in each 'state'
# Adding and modifying columns in medicare_data
# Adding a new column for the squared value of 'average_cost'
medicare_data[, avg_cost_sq := average_cost^2]
# I added 'avg_cost_sq' as the square of 'average_cost'
# Removing the avg_cost_sq column
medicare_data[, avg_cost_sq := NULL]
# I removed the 'avg_cost_sq' column as it was no longer needed
# Adding multiple new columns
medicare_data[, `:=`(avg_cost_sq = average_cost^2, beneficiaries_log = log(beneficiaries))]
# I created 'avg_cost_sq' as the square of 'average_cost' and 'beneficiaries_log' as the log of 'beneficiaries'
# Subsetting and editing specific rows
medicare_data[1:3, note := "High priority"]
# I added a 'note' column with "High priority" for the first three rows
# Renaming columns using setnames
setnames(medicare_data, old = "beneficiaries", new = "total_beneficiaries")
# I renamed 'beneficiaries' to 'total_beneficiaries'
# Editing levels of a factor column
medicare_data[, state_factor := factor(state)]
setattr(medicare_data$state_factor, "levels", c("FL", "TX", "CA"))
# I changed the levels of 'state_factor' to abbreviations
# Writing code compatible with both data.frame and data.table: My customized Medicare example
# Load the data.table package
library(data.table)
# I am creating a Medicare dataset
medicare_data <- data.table(
state = rep(c("Florida", "Texas", "California"), each = 5),
beneficiaries = c(100000, 150000, 120000, 130000, 140000, 110000, 160000, 170000, 180000, 190000, 200000, 210000, 220000, 230000, 240000),
average_cost = c(8000, 8200, 8100, 8050, 7900, 8500, 8600, 8700, 8800, 8900, 9000, 9100, 9200, 9300, 9400)
)
# I also convert it to a data.frame to show compatibility
medicare_df <- as.data.frame(medicare_data)
# Subsetting rows
medicare_data[1:3, ]
## state beneficiaries average_cost
## <char> <num> <num>
## 1: Florida 100000 8000
## 2: Florida 150000 8200
## 3: Florida 120000 8100
# I subset the first three rows from the data.table
medicare_df[1:3, ]
## state beneficiaries average_cost
## 1 Florida 100000 8000
## 2 Florida 150000 8200
## 3 Florida 120000 8100
# I subset the first three rows from the data.frame
# Subsetting columns
medicare_data[, c("beneficiaries", "average_cost"), with = FALSE]
## beneficiaries average_cost
## <num> <num>
## 1: 100000 8000
## 2: 150000 8200
## 3: 120000 8100
## 4: 130000 8050
## 5: 140000 7900
## 6: 110000 8500
## 7: 160000 8600
## 8: 170000 8700
## 9: 180000 8800
## 10: 190000 8900
## 11: 200000 9000
## 12: 210000 9100
## 13: 220000 9200
## 14: 230000 9300
## 15: 240000 9400
# I subset the 'beneficiaries' and 'average_cost' columns from the data.table
medicare_df[, c("beneficiaries", "average_cost")]
## beneficiaries average_cost
## 1 100000 8000
## 2 150000 8200
## 3 120000 8100
## 4 130000 8050
## 5 140000 7900
## 6 110000 8500
## 7 160000 8600
## 8 170000 8700
## 9 180000 8800
## 10 190000 8900
## 11 200000 9000
## 12 210000 9100
## 13 220000 9200
## 14 230000 9300
## 15 240000 9400
# I subset the 'beneficiaries' and 'average_cost' columns from the data.frame
# Subsetting with variables
mycols <- c("beneficiaries", "average_cost")
medicare_data[, mycols, with = FALSE]
## beneficiaries average_cost
## <num> <num>
## 1: 100000 8000
## 2: 150000 8200
## 3: 120000 8100
## 4: 130000 8050
## 5: 140000 7900
## 6: 110000 8500
## 7: 160000 8600
## 8: 170000 8700
## 9: 180000 8800
## 10: 190000 8900
## 11: 200000 9000
## 12: 210000 9100
## 13: 220000 9200
## 14: 230000 9300
## 15: 240000 9400
# I used a variable 'mycols' to subset columns from the data.table
medicare_df[, mycols]
## beneficiaries average_cost
## 1 100000 8000
## 2 150000 8200
## 3 120000 8100
## 4 130000 8050
## 5 140000 7900
## 6 110000 8500
## 7 160000 8600
## 8 170000 8700
## 9 180000 8800
## 10 190000 8900
## 11 200000 9000
## 12 210000 9100
## 13 220000 9200
## 14 230000 9300
## 15 240000 9400
# I used the same variable to subset columns from the data.frame
# Subsetting indexed rows (using a condition)
medicare_data[beneficiaries > 150000, ]
## state beneficiaries average_cost
## <char> <num> <num>
## 1: Texas 160000 8600
## 2: Texas 170000 8700
## 3: Texas 180000 8800
## 4: Texas 190000 8900
## 5: California 200000 9000
## 6: California 210000 9100
## 7: California 220000 9200
## 8: California 230000 9300
## 9: California 240000 9400
# I subset rows where 'beneficiaries' are greater than 150,000 from the data.table
medicare_df[medicare_df$beneficiaries > 150000, ]
## state beneficiaries average_cost
## 7 Texas 160000 8600
## 8 Texas 170000 8700
## 9 Texas 180000 8800
## 10 Texas 190000 8900
## 11 California 200000 9000
## 12 California 210000 9100
## 13 California 220000 9200
## 14 California 230000 9300
## 15 California 240000 9400
# I subset rows where 'beneficiaries' are greater than 150,000 from the data.frame
# Getting a single column as a table or vector
single_column_table <- medicare_data[, .(beneficiaries)]
# I extracted 'beneficiaries' as a one-column table from the data.table
single_column_vector <- medicare_df$beneficiaries
# I extracted 'beneficiaries' as a vector from the data.frame
# Getting a single row as a vector
first_row_vector <- unlist(medicare_data[1, ])
# I extracted the first row as a vector from the data.table
first_row_vector_df <- unlist(medicare_df[1, ])
# I extracted the first row as a vector from the data.frame
# Writing code compatible with both data.frame and data.table: My customized version
# Load the data.table package
library(data.table)
# I am creating a matrix, a data.frame, and a data.table
my_matrix <- matrix(rnorm(12), nrow = 4, dimnames = list(letters[1:4], c('A', 'B', 'C')))
my_df <- as.data.frame(my_matrix)
my_dt <- as.data.table(my_matrix)
# I am subsetting the 2nd and 3rd items from the matrix
my_matrix[2:3]
## [1] -2.1377195 -0.4321003
# This is how I return the 2nd and 3rd items, treating 'my_matrix' like a vector
# I am subsetting the 2nd and 3rd columns from the data.frame
my_df[2:3]
## B C
## a -0.9944632 -0.9616720
## b -1.9338497 -0.7455910
## c -2.5517490 -0.2899556
## d 0.1268990 -0.6123586
# This is my way of getting the 2nd and 3rd columns from the data.frame
# I am subsetting the 2nd and 3rd rows from the data.table
my_dt[2:3]
## A B C
## <num> <num> <num>
## 1: -2.1377195 -1.933850 -0.7455910
## 2: -0.4321003 -2.551749 -0.2899556
# Here, I focus on retrieving the 2nd and 3rd rows from the data.table
# To be clear about row subsetting, I use a comma
my_matrix[2:3, ]
## A B C
## b -2.1377195 -1.933850 -0.7455910
## c -0.4321003 -2.551749 -0.2899556
# This returns the 2nd and 3rd rows from the matrix explicitly
my_df[2:3, ]
## A B C
## b -2.1377195 -1.933850 -0.7455910
## c -0.4321003 -2.551749 -0.2899556
# For the data.frame, I return the 2nd and 3rd rows using the comma
my_dt[2:3, ]
## A B C
## <num> <num> <num>
## 1: -2.1377195 -1.933850 -0.7455910
## 2: -0.4321003 -2.551749 -0.2899556
# This is how I explicitly get the 2nd and 3rd rows from the data.table
# Now, I want to subset the 2nd and 3rd columns
my_matrix[, 2:3]
## B C
## a -0.9944632 -0.9616720
## b -1.9338497 -0.7455910
## c -2.5517490 -0.2899556
## d 0.1268990 -0.6123586
# My approach to getting the 2nd and 3rd columns from the matrix
my_df[, 2:3]
## B C
## a -0.9944632 -0.9616720
## b -1.9338497 -0.7455910
## c -2.5517490 -0.2899556
## d 0.1268990 -0.6123586
# For the data.frame, I use this to get the 2nd and 3rd columns
my_dt[, 2:3, with = FALSE]
## B C
## <num> <num>
## 1: -0.9944632 -0.9616720
## 2: -1.9338497 -0.7455910
## 3: -2.5517490 -0.2899556
## 4: 0.1268990 -0.6123586
# I retrieve the 2nd and 3rd columns from the data.table like this
# Subsetting with column names
my_matrix[, c("B", "C")]
## B C
## a -0.9944632 -0.9616720
## b -1.9338497 -0.7455910
## c -2.5517490 -0.2899556
## d 0.1268990 -0.6123586
# Using column names, I get 'B' and 'C' from the matrix
my_df[, c("B", "C")]
## B C
## a -0.9944632 -0.9616720
## b -1.9338497 -0.7455910
## c -2.5517490 -0.2899556
## d 0.1268990 -0.6123586
# For the data.frame, I subset columns 'B' and 'C'
my_dt[, c("B", "C"), with = FALSE]
## B C
## <num> <num>
## 1: -0.9944632 -0.9616720
## 2: -1.9338497 -0.7455910
## 3: -2.5517490 -0.2899556
## 4: 0.1268990 -0.6123586
# In the data.table, I extract columns 'B' and 'C' using their names
# Using a variable to subset columns
my_selected_cols <- 2:3
my_matrix[, my_selected_cols]
## B C
## a -0.9944632 -0.9616720
## b -1.9338497 -0.7455910
## c -2.5517490 -0.2899556
## d 0.1268990 -0.6123586
# Here, I use 'my_selected_cols' to get the 2nd and 3rd columns from the matrix
my_df[, my_selected_cols]
## B C
## a -0.9944632 -0.9616720
## b -1.9338497 -0.7455910
## c -2.5517490 -0.2899556
## d 0.1268990 -0.6123586
# In the data.frame, I use 'my_selected_cols' for the same purpose
my_dt[, my_selected_cols, with = FALSE]
## B C
## <num> <num>
## 1: -0.9944632 -0.9616720
## 2: -1.9338497 -0.7455910
## 3: -2.5517490 -0.2899556
## 4: 0.1268990 -0.6123586
# This is how I achieve it in the data.table
# Subsetting rows based on a condition
my_dt[my_dt$A > 0, ]
## Empty data.table (0 rows and 3 cols): A,B,C
# I filter rows in the data.table where 'A' has values greater than 0
my_df[my_df$A > 0, ]
## [1] A B C
## <0 rows> (or 0-length row.names)
# Similarly, I filter rows in the data.frame where 'A' is greater than 0
# Selecting a single column as a table or vector
single_col_table_dt <- my_dt[, .(B)]
# I create a one-column table from the data.table for column 'B'
single_col_vector_df <- my_df$B
# From the data.frame, I extract 'B' as a vector
# Getting a single row as a vector
first_row_vector_dt <- unlist(my_dt[1, ])
# I convert the first row of the data.table into a vector
first_row_vector_df <- unlist(my_df[1, ])
# Similarly, I convert the first row of the data.frame into a vector
# Setting keys and indices in data.table: My personalized example
# Load the data.table package
library(data.table)
# I am creating a data.table with new data
my_DT <- data.table(
product = letters[6:10],
price = c(200, 150, 300, 100, 250),
in_stock = c(TRUE, TRUE, FALSE, TRUE, FALSE)
)
# Let's view my data.table
# my_DT
# This shows the initial data.table structure with 'product', 'price', and 'in_stock'
# Setting the key for efficient operations based on 'price'
setkey(my_DT, price)
# I can check the key using the tables() function
tables()
## NAME NROW NCOL MB COLS KEY
## 1: medicare_data 15 3 0 state,beneficiaries,average_cost [NULL]
## 2: my_DF 5 3 0 my_letters,my_numbers,my_logical [NULL]
## 3: my_dt 4 3 0 A,B,C [NULL]
## 4: my_DT 5 3 0 product,price,in_stock price
## 5: my_DT_from_matrix 10 10 0 V1,V2,V3,V4,V5,V6,... [NULL]
## 6: single_col_table_dt 4 1 0 B [NULL]
## 7: single_column_table 15 1 0 beneficiaries [NULL]
## Total: 0MB using type_size
# This displays the metadata, showing 'price' is set as the key
# Viewing my data.table after setting the key
# my_DT
# The data.table is now sorted by 'price' because 'price' is the key
# Now, I'll use the new "on=" feature for fast indexing without relying on keys
# Setting a secondary index on 'product'
setindex(my_DT, product)
# I check which indices have been set
indices(my_DT)
## [1] "product"
# This confirms that 'product' has been set as a secondary index