Find and get a dataset from the datasets available within R.
Perform exploratory data analysis (EDA) and prepare a codebook on that dataset using a newer method in R
library(explore)
library(dplyr)
# Exploratory data analysis of the built-in mtcars dataset using the
# explore package (piped with %>%).
data(mtcars)
# Whole-table view first, then every variable.
mtcars %>% explore_tbl()
mtcars %>% explore_all()
# Single variable: am.
mtcars %>% explore(am)
# mpg in relation to the target am.
mtcars %>% explore(mpg, target=am)
# Two variables (drat, mpg) together with qsec passed as the target.
mtcars %>% explore(drat, mpg, target=qsec)
# Keep five columns only, then explore each of them against the target gear.
mtcars %>% select(gear, disp, hp, cyl ,qsec) %>% explore_all(target = gear)
# explain_tree() for two different targets: am and wt.
# NOTE(review): presumably a classification-style tree for the 0/1 am and a
# regression-style tree for the continuous wt -- confirm in the explore docs.
mtcars %>% explain_tree(target = am)
mtcars %>% explain_tree(target = wt)
# Render the explore report; the console log below shows the file
# AAQ2_report.html being created in the session's temporary directory.
mtcars %>% report(output_file = "AAQ2_report.html", output_dir = tempdir())
##
##
## processing file: template_report_variable.Rmd
##
|
| | 0%
|
|........ | 11%
## ordinary text without R code
##
##
|
|................ | 22%
## label: unnamed-chunk-1 (with options)
## List of 4
## $ fig.height: num 3
## $ echo : logi FALSE
## $ message : logi FALSE
## $ warning : logi FALSE
##
|
|....................... | 33%
## ordinary text without R code
##
##
|
|............................... | 44%
## label: unnamed-chunk-2 (with options)
## List of 3
## $ echo : logi FALSE
## $ message: logi FALSE
## $ warning: logi FALSE
##
##
|
|....................................... | 56%
## ordinary text without R code
##
##
|
|............................................... | 67%
## label: unnamed-chunk-3 (with options)
## List of 1
## $ include: logi FALSE
##
##
|
|...................................................... | 78%
## ordinary text without R code
##
##
|
|.............................................................. | 89%
## label: unnamed-chunk-4 (with options)
## List of 5
## $ echo : logi FALSE
## $ fig.height: num 18
## $ fig.width : num 10
## $ message : logi FALSE
## $ warning : logi FALSE
##
|
|......................................................................| 100%
## ordinary text without R code
## output file: C:/Users/Hui Xin/AppData/Local/Temp/Rtmp8shWh3/template_report_variable.knit.md
## "C:/Program Files/RStudio/bin/quarto/bin/pandoc" +RTS -K512m -RTS "C:/Users/Hui Xin/AppData/Local/Temp/Rtmp8shWh3/template_report_variable.knit.md" --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output pandoc56c42a241dd7.html --lua-filter "C:\Users\HUIXIN~1\DOCUME~1\R\WIN-LI~1\4.1\RMARKD~1\RMARKD~1\lua\PAGEBR~1.LUA" --lua-filter "C:\Users\HUIXIN~1\DOCUME~1\R\WIN-LI~1\4.1\RMARKD~1\RMARKD~1\lua\LATEX-~1.LUA" --self-contained --variable bs3=TRUE --standalone --section-divs --template "C:\Users\HUIXIN~1\DOCUME~1\R\WIN-LI~1\4.1\RMARKD~1\rmd\h\DEFAUL~1.HTM" --no-highlight --variable highlightjs=1 --variable theme=bootstrap --mathjax --variable "mathjax-url=https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" --include-in-header "C:\Users\HUIXIN~1\AppData\Local\Temp\Rtmp8shWh3\rmarkdown-str56c44d9c6f6a.html"
##
## Output created: C:/Users/Hui Xin/AppData/Local/Temp/Rtmp8shWh3/AAQ2_report.html
Libraries used for the codebook:
- sjlabelled to read & write item labels & values.
- expss to apply variable and value labels.
- tibble to extract labels from the dataset.
- psych to do descriptive analysis.
library(sjlabelled)
library(expss)
library(tibble)
library(psych)
Check the value labels of all variables.
# List the value labels of every column; the output below is NULL for each
# column because no labels have been attached yet.
get_labels(mtcars)
## $mpg
## NULL
##
## $cyl
## NULL
##
## $disp
## NULL
##
## $hp
## NULL
##
## $drat
## NULL
##
## $wt
## NULL
##
## $qsec
## NULL
##
## $vs
## NULL
##
## $am
## NULL
##
## $gear
## NULL
##
## $carb
## NULL
Add labels to all variables that are not labelled.
# Attach labels to every column of mtcars with apply_labels().
# A single string sets the variable label; a named numeric vector sets the
# value labels. vs and am are listed twice on purpose: once for the variable
# label and once for the labels of their 0/1 codes.
mtcars <- apply_labels(
  mtcars,
  mpg = "Miles/(US) gallon",
  cyl = "Number of cylinders",
  disp = "Displacement (cu.in.)",
  hp = "Gross horsepower",
  drat = "Rear axle ratio",
  wt = "Weight (1000 lbs)",
  qsec = "1/4 mile time",
  vs = "Engine",
  vs = c("V-engine" = 0, "Straight engine" = 1),
  am = "Transmission",
  am = c("Automatic" = 0, "Manual" = 1),
  gear = "Number of forward gears",
  carb = "Number of carburetors"
)
# Re-check the value labels: only vs and am now carry them, the other
# columns still return NULL.
get_labels(mtcars)
## $mpg
## NULL
##
## $cyl
## NULL
##
## $disp
## NULL
##
## $hp
## NULL
##
## $drat
## NULL
##
## $wt
## NULL
##
## $qsec
## NULL
##
## $vs
## [1] "V-engine" "Straight engine"
##
## $am
## [1] "Automatic" "Manual"
##
## $gear
## NULL
##
## $carb
## NULL
Extract the variable labels and store them in a new object.
# One row per column of mtcars: the column name and its variable label.
mycodebook <- enframe(get_label(mtcars))
# Add column names to codebook.
colnames(mycodebook) <- c("variable_id", "item_text")
# Get descriptive statistics and select those of interest.
# describe() is namespace-qualified because the explore package, attached
# earlier in this file, also provides a describe(); which one an unqualified
# call reaches would depend on the order of the library() calls.
descriptives <- mtcars %>%
  psych::describe() %>%
  as_tibble() %>%
  select("n", "min", "max", "mean")
# Add stats to the codebook.
# cbind() relies on both tables listing the variables in the same (column)
# order; nothing is matched by name here.
mycodebook <- cbind(mycodebook, descriptives)
# Codebook is done.
mycodebook
## variable_id item_text n min max mean
## 1 mpg Miles/(US) gallon 32 10.400 33.900 20.090625
## 2 cyl Number of cylinders 32 4.000 8.000 6.187500
## 3 disp Displacement (cu.in.) 32 71.100 472.000 230.721875
## 4 hp Gross horsepower 32 52.000 335.000 146.687500
## 5 drat Rear axle ratio 32 2.760 4.930 3.596563
## 6 wt Weight (1000 lbs) 32 1.513 5.424 3.217250
## 7 qsec 1/4 mile time 32 14.500 22.900 17.848750
## 8 vs Engine 32 0.000 1.000 0.437500
## 9 am Transmission 32 0.000 1.000 0.406250
## 10 gear Number of forward gears 32 3.000 5.000 3.687500
## 11 carb Number of carburetors 32 1.000 8.000 2.812500
Demonstrate these FIVE (5) functions of dplyr for data manipulation
filter()
This function filters the data based on the required conditions, such as a value less than 100 or a value equal to “abc”.
The code below filters the nutri dataset to get the rows with Protein > 1 and Fiber > 2.
# Keep only the rows where both conditions hold (comma-separated conditions
# are combined with AND).
nutri %>%
  filter(Protein > 1, Fiber > 2)
## # A tibble: 6 x 7
## ...1 Calories Fat Carb Fiber Protein Sodium
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Hot Chocolate 320 9 47 4 14 160
## 2 Starbucks® Signature Hot Chocolate 430 26 45 5 12 115
## 3 Caffè Mocha 290 8 42 4 13 140
## 4 Iced Caffè Mocha 230 6 36 4 9 90
## 5 Chocolate Smoothie 320 5 53 8 20 170
## 6 Strawberry Smoothie 300 2 60 7 16 130
arrange()
This function sorts the dataset by the values of one or more variables. For example, sort a dataset in descending order of ID.
The code below sorts the nutri dataset in ascending order of Calories and, within ties, in descending order of Protein.
# Sort by Calories (ascending); ties are ordered by Protein (descending).
nutri %>%
  arrange(Calories, desc(Protein))
## # A tibble: 177 x 7
## ...1 Calories Fat Carb Fiber Protein Sodium
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Iced Coffee 0 0 0 0 0 0
## 2 Nariño 70 Cold Brew with Milk 0 0 0 0 0 0
## 3 Blonde Roast 5 0 0 0 1 10
## 4 Decaf Pike Place® Roast 5 0 0 0 1 10
## 5 Featured Dark Roast 5 0 0 0 1 10
## 6 Pike Place® Roast 5 0 0 0 1 10
## 7 Iced Coffee 5 0 0 0 0 5
## 8 Nariño 70 Cold Brew 5 0 0 0 0 15
## 9 Nitro Cold Brew 5 0 0 0 0 10
## 10 Starbucks® Iced Coffee Unsweetened 10 0 2 0 1 0
## # ... with 167 more rows
mutate()
This function adds new column(s) to the dataset. The added column can hold, for example, a ranking, a mean, a maximum, or a modification of values from the existing columns.
The code below adds a new column called Rank that displays the ranking of each item based on Calories. By default, rank 1 goes to the lowest value.
# Append a Rank column holding the min_rank() of Calories.
nutri %>%
  mutate(Rank = min_rank(Calories))
## # A tibble: 177 x 8
## ...1 Calories Fat Carb Fiber Protein Sodium Rank
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 Cool Lime Starbucks Refreshe~ 45 0 11 0 0 10 15
## 2 Ombré Pink Drink NA NA NA NA NA NA NA
## 3 Pink Drink NA NA NA NA NA NA NA
## 4 Strawberry Acai Starbucks Re~ 80 0 18 1 0 10 30
## 5 Very Berry Hibiscus Starbuck~ 60 0 14 1 0 10 18
## 6 Violet Drink NA NA NA NA NA NA NA
## 7 Evolution Fresh™ Cold-Presse~ NA NA NA NA NA NA NA
## 8 Evolution Fresh™ Defense Up NA NA NA NA NA NA NA
## 9 Evolution Fresh™ Organic Gin~ 110 0 28 0 0 5 35
## 10 Iced Coffee 0 0 0 0 0 0 1
## # ... with 167 more rows
select()
This function displays the desired columns of data. The difference from filter() is that select() chooses columns by variable name, whereas filter() chooses rows by condition.
The code below selects the columns Fat, Protein and Fiber.
# Keep three columns, in this order.
nutri %>%
  select(Fat, Protein, Fiber)
## # A tibble: 177 x 3
## Fat Protein Fiber
## <dbl> <dbl> <dbl>
## 1 0 0 0
## 2 NA NA NA
## 3 NA NA NA
## 4 0 0 1
## 5 0 0 1
## 6 NA NA NA
## 7 NA NA NA
## 8 NA NA NA
## 9 0 0 0
## 10 0 0 0
## # ... with 167 more rows
summarise()
This function gets summary statistics of the data, such as n, mean or percentiles.
The code below summarises the mean, median and maximum value of Calories.
# Collapse the table to one row of Calories statistics; na.rm = TRUE drops
# the missing values before each statistic is computed.
nutri %>%
  summarise(
    Cal_Mean = mean(Calories, na.rm = TRUE),
    Cal_Median = median(Calories, na.rm = TRUE),
    Max_cal = max(Calories, na.rm = TRUE)
  )
## # A tibble: 1 x 3
## Cal_Mean Cal_Median Max_cal
## <dbl> <dbl> <dbl>
## 1 135. 130 430