library(rmarkdown)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.5 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.9
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
library(data.table)
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
library(readr)
# 2. Show the current working directory; the relative path "questions.csv"
#    used below is resolved against it.
getwd()
## [1] "/Users/faiz/Downloads/BDA_POP/KLU/Session2/DataAnalytics"

# 3. Import the dataset.
# 4. Read the same local CSV file with four different readers -- read.csv,
#    read.table, readr (read_delim) and data.table (fread) -- and time each
#    call to find the fastest method.
#    Only the fread() result is kept (as `allData`); the other three reads are
#    discarded because only their run time is of interest here.
read.csv.timing <- system.time(
  read.csv("questions.csv", header = TRUE, sep = ",")
)
read.table.timing <- system.time(
  read.table("questions.csv", header = TRUE, sep = ",")
)
readr.timing <- system.time(
  read_delim("questions.csv", delim = ",", col_names = TRUE)
)
## Rows: 17203824 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): Id, Score, OwnerUserId, AnswerCount
## dttm (3): CreationDate, ClosedDate, DeletionDate
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data.table.timing <- system.time(
  allData <- fread("questions.csv", showProgress = FALSE)
)

# Summary table: one row per reader with its elapsed (wall-clock) seconds,
# i.e. the third entry of each system.time() result.
dataS <- data.frame(
  method = c("read.csv", "read.table", "readr", "fread"),
  timing = c(
    read.csv.timing[["elapsed"]],
    read.table.timing[["elapsed"]],
    readr.timing[["elapsed"]],
    data.table.timing[["elapsed"]]
  )
)
dataS
# 5. Compare the timings of the two fastest readers (readr vs. fread).
#    `readr.timing` and `data.table.timing` were already measured earlier in
#    this script, and `allData` already holds the fread() result, so they are
#    reused here instead of parsing the 17,203,824-row file two more times.
data <- data.frame(
  method = c("readr", "fread"),
  timing = c(
    readr.timing[["elapsed"]],      # elapsed (wall-clock) seconds
    data.table.timing[["elapsed"]]
  )
)
data

# 6. Get sample data from the table: print the first rows of the dataset.
head(allData)