dsa_406_001_sp25_wk4proj

# This is the code ChatGPT generated when I gave it this prompt: "You are a data scientist exploring a large data set using exploratory data analysis principles. Your task is to create code in R using data.table functions to look at the variable types in the data set, create a new variable using variables from the data set, and exclude certain values from the data set. Also, include performance comparisons with equivalent dplyr operations."

# Load required package
library(data.table)
library(dplyr)


Attaching package: 'dplyr'

The following objects are masked from 'package:data.table':

    between, first, last

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

library(microbenchmark)

Warning: package 'microbenchmark' was built under R version 4.3.3

# Toy employee table used throughout the script (10 rows, as a data.table)
dt <- data.table(
  ID         = seq_len(10L),
  Age        = c(25, 30, 22, 40, 35, 28, 32, 27, 29, 31),
  Salary     = 1000 * c(50, 60, 45, 80, 75, 52, 67, 49, 56, 62),
  Department = c(
    "HR", "IT", "Finance", "IT", "HR",
    "Finance", "HR", "IT", "Finance", "HR"
  )
)

# Plain data.frame version of the same rows, used for the dplyr examples
df <- as.data.frame(dt)

# 1. Check variable types
# vapply() guarantees one string per column. sapply() would silently return a
# list instead of a character vector if any column carried several classes
# (e.g. POSIXct), so multi-class columns are collapsed with "/".
print(vapply(
  dt,
  function(col) paste(class(col), collapse = "/"),
  character(1)
))  # data.table

         ID         Age      Salary  Department 
  "integer"   "numeric"   "numeric" "character"

# Same type check on the data.frame. This is plain base R (no dplyr verb is
# involved). vapply() keeps the result a character vector even if a column
# has several classes, which are collapsed with "/".
print(vapply(
  df,
  function(col) paste(class(col), collapse = "/"),
  character(1)
))  # data.frame (base R)

         ID         Age      Salary  Department 
  "integer"   "numeric"   "numeric" "character"

# 2. Derive a new column: label each employee "Junior" (Age under 30) or
#    "Senior" (Age 30 and over)

# dplyr: mutate() returns a new data frame, so the result is assigned back
df <- mutate(df, Experience_Level = ifelse(Age >= 30, "Senior", "Junior"))

# data.table: `:=` adds the column to dt by reference, no reassignment needed
dt[, Experience_Level := ifelse(Age >= 30, "Senior", "Junior")]

# 3. Drop unwanted rows (here: every employee in the IT department)

# dplyr: keep the rows whose Department is anything other than "IT"
df_filtered <- filter(df, Department != "IT")

# data.table: the same condition as a row filter in `i`
dt_filtered <- dt[Department != "IT", ]

# 4. Performance comparison using microbenchmark
# Each named expression below is evaluated `times` (1000) times and the
# timings are stored in benchmark_results.
# Caveats when reading the numbers:
#   - `:=` updates `dt` by reference, so data_table_create overwrites the
#     already-existing Experience_Level column in place on every run, whereas
#     the dplyr expression builds a new data frame that is discarded.
#   - `dt` and `df` hold only 10 rows, so the timings likely reflect fixed
#     per-call overhead rather than how either approach scales to large data.
benchmark_results <- microbenchmark(
  data_table_create = dt[, Experience_Level := ifelse(Age < 30, "Junior", "Senior")],
  dplyr_create = df %>% mutate(Experience_Level = ifelse(Age < 30, "Junior", "Senior")),
  data_table_filter = dt[Department != "IT"],
  dplyr_filter = df %>% filter(Department != "IT"),
  times = 1000
)

Warning in microbenchmark(data_table_create = dt[, `:=`(Experience_Level, :
less accurate nanosecond times to avoid potential integer overflows

# Print results
# Full table, now including the derived Experience_Level column
print(dt)

    ID Age Salary Department Experience_Level
 1:  1  25  50000         HR           Junior
 2:  2  30  60000         IT           Senior
 3:  3  22  45000    Finance           Junior
 4:  4  40  80000         IT           Senior
 5:  5  35  75000         HR           Senior
 6:  6  28  52000    Finance           Junior
 7:  7  32  67000         HR           Senior
 8:  8  27  49000         IT           Junior
 9:  9  29  56000    Finance           Junior
10: 10  31  62000         HR           Senior

# Rows of dt left after excluding the IT department
print(dt_filtered)

   ID Age Salary Department Experience_Level
1:  1  25  50000         HR           Junior
2:  3  22  45000    Finance           Junior
3:  5  35  75000         HR           Senior
4:  6  28  52000    Finance           Junior
5:  7  32  67000         HR           Senior
6:  9  29  56000    Finance           Junior
7: 10  31  62000         HR           Senior

# Same exclusion applied to the data.frame with dplyr's filter()
print(df_filtered)

  ID Age Salary Department Experience_Level
1  1  25  50000         HR           Junior
2  3  22  45000    Finance           Junior
3  5  35  75000         HR           Senior
4  6  28  52000    Finance           Junior
5  7  32  67000         HR           Senior
6  9  29  56000    Finance           Junior
7 10  31  62000         HR           Senior

# Timing summary for the four benchmarked expressions
print(benchmark_results)

Unit: microseconds
              expr     min       lq      mean   median       uq      max neval
 data_table_create 170.765 193.3970 232.80792 205.6560 216.0495 6885.294  1000
      dplyr_create 451.656 474.2675 529.65087 487.5515 504.3615 6380.543  1000
 data_table_filter  59.245  74.7020  92.29727  81.5900  86.6125 5133.200  1000
      dplyr_filter 409.057 436.1375 481.15759 448.4170 465.7805 5051.487  1000
  cld
 a   
  b  
   c 
    d