8 useful packages

Useful R Packages for Data Science You Aren't Using (But Should!)

[SITE] https://github.com/dcomtois/summarytools

packages

library(tidyverse)      # purrr
library(summarytools)
library(magrittr)
library(DataExplorer)   # DataExplorer
library(esquisse)       # ggplot2 builder
library(mlr)            # Machine Learning
library(parsnip)
library(ranger)
library(fueleconomy)
library(GGally)

purrr & tidyverse

# Regress mpg on wt within each cylinder group and collect the R-squared of
# every fit as a named numeric vector (names are the cyl values).
data(mtcars)
split(mtcars, mtcars$cyl) %>%  # one data frame per distinct value of cyl
  map_dbl(function(cyl_group) {
    group_fit <- lm(mpg ~ wt, data = cyl_group)
    summary(group_fit)$r.squared
  })

##         4         6         8 
## 0.5086326 0.4645102 0.4229655

DataExplorer

# DataExplorer: automated exploratory data analysis (EDA).
# create_report(iris) generates a comprehensive EDA report in a single call;
# the individual plotting helpers are called one at a time below.

data(iris)

# Missing-value profile of each column.
plot_missing(iris)

# Plotted overview of the data set; introduce(iris) is the tabular counterpart.
plot_intro(iris)

# Histograms of the columns.
plot_histogram(iris)

# Density estimates of the columns.
plot_density(iris)

# Box plots broken down by Species, laid out in 4 columns.
plot_boxplot(iris, by = "Species", ncol = 4)

# Correlation plot; cor_args carries the `use` setting for handling missing
# values (alternative: "pairwise.complete.obs").
plot_correlation(iris, cor_args = list(use = "complete.obs"))

# Correlation plot restricted to continuous features. The type is spelled out
# in full instead of relying on the abbreviation "c" being partially matched.
plot_correlation(iris, type = "continuous")

# Same overview plot as above, with a custom ggplot2 theme and title.
plot_intro(
  iris,
  ggtheme = theme_minimal(),
  title = "Automated EDA with Data Explorer"
)

***

summarytools

# Column-by-column summary of iris, printed to the console.
# Alternative: view(dfSummary(iris))
print(dfSummary(iris))

## Data Frame Summary  
## iris  
## Dimensions: 150 x 5  
## Duplicates: 1  
## 
## ----------------------------------------------------------------------------------------------------------------------
## No   Variable        Stats / Values           Freqs (% of Valid)   Graph                            Valid    Missing  
## ---- --------------- ------------------------ -------------------- -------------------------------- -------- ---------
## 1    Sepal.Length    Mean (sd) : 5.8 (0.8)    35 distinct values     . . : :                        150      0        
##      [numeric]       min < med < max:                                : : : :                        (100%)   (0%)     
##                      4.3 < 5.8 < 7.9                                 : : : : :                                        
##                      IQR (CV) : 1.3 (0.1)                            : : : : :                                        
##                                                                    : : : : : : : :                                    
## 
## 2    Sepal.Width     Mean (sd) : 3.1 (0.4)    23 distinct values           :                        150      0        
##      [numeric]       min < med < max:                                      :                        (100%)   (0%)     
##                      2 < 3 < 4.4                                         . :                                          
##                      IQR (CV) : 0.5 (0.1)                              : : : :                                        
##                                                                    . . : : : : : :                                    
## 
## 3    Petal.Length    Mean (sd) : 3.8 (1.8)    43 distinct values   :                                150      0        
##      [numeric]       min < med < max:                              :         . :                    (100%)   (0%)     
##                      1 < 4.3 < 6.9                                 :         : : .                                    
##                      IQR (CV) : 3.5 (0.5)                          : :       : : : .                                  
##                                                                    : :   . : : : : : .                                
## 
## 4    Petal.Width     Mean (sd) : 1.2 (0.8)    22 distinct values   :                                150      0        
##      [numeric]       min < med < max:                              :                                (100%)   (0%)     
##                      0.1 < 1.3 < 2.5                               :       . .   :                                    
##                      IQR (CV) : 1.5 (0.6)                          :       : :   :   .                                
##                                                                    : :   : : : . : : :                                
## 
## 5    Species         1. setosa                50 (33.3%)           IIIIII                           150      0        
##      [factor]        2. versicolor            50 (33.3%)           IIIIII                           (100%)   (0%)     
##                      3. virginica             50 (33.3%)           IIIIII                                             
## ----------------------------------------------------------------------------------------------------------------------

magrittr

# Common descriptive statistics for iris, converted to a tibble by tb().
iris %>%
  descr(stats = "common") %>%
  tb()

## # A tibble: 4 x 8
##   variable    mean         sd           min   med   max   n.valid pct.valid
##   <chr>       <chr>        <chr>        <chr> <chr> <chr> <chr>   <chr>    
## 1 Petal.Leng~ 3.758        1.765298233~ 1     4.35  6.9   150     100      
## 2 Petal.Width 1.199333333~ 0.762237668~ 0.1   1.3   2.5   150     100      
## 3 Sepal.Leng~ 5.843333333~ 0.828066127~ 4.3   5.8   7.9   150     100      
## 4 Sepal.Width 3.057333333~ 0.435866284~ 2     3     4.4   150     100

# Frequency table of Species, without cumulative columns or NA reporting,
# converted to a tibble by tb().
iris$Species %>%
  freq(cumul = FALSE, report.nas = FALSE) %>%
  tb()

## # A tibble: 3 x 3
##   value       freq   pct
##   <fct>      <dbl> <dbl>
## 1 setosa        50  33.3
## 2 versicolor    50  33.3
## 3 virginica     50  33.3

# Cross-tabulation of gender by smoker from the tobacco data, with a
# chi-squared test (Chi.squared); %$% exposes the columns of tobacco by name.
tobacco %$%
  ctable(x = gender, y = smoker, chisq = TRUE, headings = FALSE)

## 
## -------- -------- ------------- ------------- ---------------
##            smoker           Yes            No           Total
##   gender                                                     
##        F            147 (30.1%)   342 (69.9%)    489 (100.0%)
##        M            143 (29.2%)   346 (70.8%)    489 (100.0%)
##     <NA>              8 (36.4%)    14 (63.6%)     22 (100.0%)
##    Total            298 (29.8%)   702 (70.2%)   1000 (100.0%)
## -------- -------- ------------- ------------- ---------------
## 
## ----------------------------
##  Chi.squared   df   p.value 
## ------------- ---- ---------
##    0.5415      2    0.7628  
## ----------------------------

mlr

# Hold-out evaluation of a random forest classifier on iris with mlr.
data(iris)
learner <- makeLearner(cl = "classif.randomForest")
task <- makeClassifTask(id = "iris", data = iris, target = "Species")
holdout(learner = learner, task = task)

## Resample Result
## Task: iris
## Learner: classif.randomForest
## Aggr perf: mmce.test.mean=0.0800000
## Runtime: 0.03391

# Repeat the hold-out run, this time scored by accuracy (acc):
# 90~96% accuracy - not bad!
holdout(learner = learner, task = task, measures = acc)

## Resample Result
## Task: iris
## Learner: classif.randomForest
## Aggr perf: acc.test.mean=0.9200000
## Runtime: 0.0289218

parsnip

# Build a linear regression model of mpg on every other mtcars column,
# using parsnip with the "lm" engine, then print the fitted model object.
data(mtcars)
fit <-
  linear_reg(mode = "regression") %>%
  set_engine(engine = "lm") %>%
  fit(formula = mpg ~ ., data = mtcars)
fit

## parsnip model object
## 
## 
## Call:
## stats::lm(formula = formula, data = data)
## 
## Coefficients:
## (Intercept)          cyl         disp           hp         drat  
##    12.30337     -0.11144      0.01334     -0.02148      0.78711  
##          wt         qsec           vs           am         gear  
##    -3.71530      0.82104      0.31776      2.52023      0.65541  
##        carb  
##    -0.19942

ranger

# ranger is a fast implementation of random forests (Breiman 2001), or
# recursive partitioning, particularly suited for high-dimensional data.

data(iris)
# Classification forest with 100 trees and mtry = 3. The formula stays
# positional so the call echoed in the printed result is unchanged.
ranger(
  Species ~ .,
  data = iris,
  num.trees = 100,
  mtry = 3
)

## Ranger result
## 
## Call:
##  ranger(Species ~ ., data = iris, num.trees = 100, mtry = 3) 
## 
## Type:                             Classification 
## Number of trees:                  100 
## Sample size:                      150 
## Number of independent variables:  4 
## Mtry:                             3 
## Target node size:                 1 
## Variable importance mode:         none 
## Splitrule:                        gini 
## OOB prediction error:             5.33 %

# Hold-out evaluation: train a forest on a random two-thirds of iris and
# predict the remaining third.
# Fix the RNG seed so the split, the forest and the confusion matrix are
# reproducible from run to run (the exact counts depend on the seed chosen).
set.seed(42)
# floor() makes the integer sample size explicit instead of relying on
# sample() silently truncating a fractional size.
train.idx <- sample(nrow(iris), floor(2 / 3 * nrow(iris)))
iris.train <- iris[train.idx, ]
iris.test <- iris[-train.idx, ]  # every row not drawn for training
rg.iris <- ranger(Species ~ ., data = iris.train)
pred.iris <- predict(rg.iris, data = iris.test)  # predictions for held-out rows
# Confusion matrix: rows are the true species, columns the predicted species.
table(iris.test$Species, pred.iris$predictions)

##             
##              setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0         21         0
##   virginica       0          2        12

GGally :: ggpairs

# library(fueleconomy)
# Take the first 100 rows of the vehicles data as a small working sample
# and inspect its structure.
data(vehicles)
df <- vehicles[seq_len(100), ]
str(df)

## Classes 'tbl_df', 'tbl' and 'data.frame':    100 obs. of  12 variables:
##  $ id   : int  27550 28426 27549 28425 1032 1033 3347 13309 13310 13311 ...
##  $ make : chr  "AM General" "AM General" "AM General" "AM General" ...
##  $ model: chr  "DJ Po Vehicle 2WD" "DJ Po Vehicle 2WD" "FJ8c Post Office" "FJ8c Post Office" ...
##  $ year : int  1984 1984 1984 1984 1985 1985 1987 1997 1997 1997 ...
##  $ class: chr  "Special Purpose Vehicle 2WD" "Special Purpose Vehicle 2WD" "Special Purpose Vehicle 2WD" "Special Purpose Vehicle 2WD" ...
##  $ trans: chr  "Automatic 3-spd" "Automatic 3-spd" "Automatic 3-spd" "Automatic 3-spd" ...
##  $ drive: chr  "2-Wheel Drive" "2-Wheel Drive" "2-Wheel Drive" "2-Wheel Drive" ...
##  $ cyl  : int  4 4 6 6 4 6 6 4 4 6 ...
##  $ displ: num  2.5 2.5 4.2 4.2 2.5 4.2 3.8 2.2 2.2 3 ...
##  $ fuel : chr  "Regular" "Regular" "Regular" "Regular" ...
##  $ hwy  : int  17 17 13 13 17 13 21 26 28 26 ...
##  $ cty  : int  18 18 13 13 16 13 14 20 22 18 ...

# library(GGally)
# Pairwise plot matrix of the integer columns.
quant_df <- df[c("cyl", "hwy", "cty")]
ggpairs(quant_df)

# Pairwise plot matrix of the character columns.
cat_df <- df[c("fuel", "make", "drive")]
ggpairs(cat_df)

***

8 useful packages

updragon

2019 8 14

8 useful packages

packages

purrr & tidyverse

DataExplorer

summarytools

magrittr

mlr

parsnip

ranger

GGally :: ggpairs