1. Import libraries

library(rmarkdown)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.5     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.9     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/

library(data.table)

## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose

library(readr)

# 2. Get the current working directory

# Print the current working directory. The reads below use the relative path
# "questions.csv", which is resolved against this directory.
getwd()

## [1] "/Users/faiz/Downloads/BDA_POP/KLU/Session2/DataAnalytics"

# 3. Import dataset

# 4. Read a CSV file from a local path using the fastest method

# Compare read.csv, read.table, readr (read_delim), and fread

# Time the two base R readers on the same file. The parsed data is discarded;
# only the timings are kept.
read.csv.timing <- system.time(
  read.csv("questions.csv", header = TRUE, sep = ",")
)
read.table.timing <- system.time(
  read.table("questions.csv", header = TRUE, sep = ",")
)

# Time readr::read_delim() on the same file, naming the delimiter explicitly.
readr.timing <- system.time(
  read_delim("questions.csv", delim = ",", col_names = TRUE)
)

## Rows: 17203824 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (4): Id, Score, OwnerUserId, AnswerCount
## dttm (3): CreationDate, ClosedDate, DeletionDate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Time data.table::fread() and keep the parsed table in `allData`.
data.table.timing <- system.time(
  allData <- fread("questions.csv", showProgress = FALSE)
)

# Collect the elapsed seconds of all four readers in one table.
dataS <- data.frame(
  method = c("read.csv", "read.table", "readr", "fread"),
  timing = c(
    read.csv.timing[["elapsed"]],
    read.table.timing[["elapsed"]],
    readr.timing[["elapsed"]],
    data.table.timing[["elapsed"]]
  )
)
dataS

# Time readr::read_delim() again; this overwrites any earlier `readr.timing`.
readr.timing <- system.time(
  read_delim("questions.csv", delim = ",", col_names = TRUE)
)

## Rows: 17203824 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (4): Id, Score, OwnerUserId, AnswerCount
## dttm (3): CreationDate, ClosedDate, DeletionDate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Time fread() again; this also reassigns `allData` with a fresh read.
data.table.timing <- system.time(
  allData <- fread("questions.csv", showProgress = FALSE)
)

# 5. Compare the timing of each method

# Put the elapsed seconds of the readr and fread timings side by side.
data <- data.frame(
  method = c("readr", "fread"),
  timing = c(readr.timing[["elapsed"]], data.table.timing[["elapsed"]])
)
data

# 6. Get sample data from the table

# Preview the first rows of `allData`, the table loaded with fread().
head(allData)

Data Analytics on Stack Overflow

1. Import libraries