# Useful R Packages for Data Science You Aren't Using (But Should!)
# [SITE] https://github.com/dcomtois/summarytools
library(tidyverse) # purrr
library(summarytools)
library(magrittr)
library(DataExplorer) # DataExplorer
library(esquisse) # ggplot2 builder
library(mlr) # Machine Learning
library(parsnip)
library(ranger)
library(fueleconomy)
library(GGally)
# purrr demo: fit mpg ~ wt separately for every cylinder count in mtcars and
# collect the R-squared of each fit as a named numeric vector.
data(mtcars)
mtcars %>%
  split(.$cyl) %>% # one data frame per distinct cyl value
  map(function(cyl_group) lm(mpg ~ wt, data = cyl_group)) %>%
  map(summary) %>%
  map_dbl(function(fit_summary) fit_summary[["r.squared"]])
## 4 6 8
## 0.5086326 0.4645102 0.4229655
### Automated EDA with DataExplorer.
### create_report(iris) generates a comprehensive EDA report in a single call;
### the individual plots are drawn one at a time below.
data(iris)
plot_missing(iris)
plot_intro(iris) # see also: introduce(iris)
plot_histogram(iris)
plot_density(iris)
plot_boxplot(iris, by = "Species", ncol = 4)
# Alternative for `use`: "pairwise.complete.obs".
plot_correlation(iris, cor_args = list(use = "complete.obs"))
# Spell the type out in full rather than relying on partial matching of "c",
# which would become ambiguous if another choice starting with "c" were added.
plot_correlation(iris, type = "continuous")
# Same intro plot again, this time with a custom theme and title.
plot_intro(
  iris,
  ggtheme = theme_minimal(),
  title = "Automated EDA with Data Explorer"
)
# ***
# Print a summarytools data-frame summary of iris to the console.
# The commented-out variant wraps the same summary in view() instead
# (presumably to open a rendered report - confirm against the summarytools docs).
# view(dfSummary(iris))
dfSummary(iris)
## Data Frame Summary
## iris
## Dimensions: 150 x 5
## Duplicates: 1
##
## ----------------------------------------------------------------------------------------------------------------------
## No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
## ---- --------------- ------------------------ -------------------- -------------------------------- -------- ---------
## 1 Sepal.Length Mean (sd) : 5.8 (0.8) 35 distinct values . . : : 150 0
## [numeric] min < med < max: : : : : (100%) (0%)
## 4.3 < 5.8 < 7.9 : : : : :
## IQR (CV) : 1.3 (0.1) : : : : :
## : : : : : : : :
##
## 2 Sepal.Width Mean (sd) : 3.1 (0.4) 23 distinct values : 150 0
## [numeric] min < med < max: : (100%) (0%)
## 2 < 3 < 4.4 . :
## IQR (CV) : 0.5 (0.1) : : : :
## . . : : : : : :
##
## 3 Petal.Length Mean (sd) : 3.8 (1.8) 43 distinct values : 150 0
## [numeric] min < med < max: : . : (100%) (0%)
## 1 < 4.3 < 6.9 : : : .
## IQR (CV) : 3.5 (0.5) : : : : : .
## : : . : : : : : .
##
## 4 Petal.Width Mean (sd) : 1.2 (0.8) 22 distinct values : 150 0
## [numeric] min < med < max: : (100%) (0%)
## 0.1 < 1.3 < 2.5 : . . :
## IQR (CV) : 1.5 (0.6) : : : : .
## : : : : : . : : :
##
## 5 Species 1. setosa 50 (33.3%) IIIIII 150 0
## [factor] 2. versicolor 50 (33.3%) IIIIII (100%) (0%)
## 3. virginica 50 (33.3%) IIIIII
## ----------------------------------------------------------------------------------------------------------------------
# Common descriptive statistics for iris, converted to a tibble with tb().
iris %>%
  descr(stats = "common") %>%
  tb()
## # A tibble: 4 x 8
## variable mean sd min med max n.valid pct.valid
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Petal.Leng~ 3.758 1.765298233~ 1 4.35 6.9 150 100
## 2 Petal.Width 1.199333333~ 0.762237668~ 0.1 1.3 2.5 150 100
## 3 Sepal.Leng~ 5.843333333~ 0.828066127~ 4.3 5.8 7.9 150 100
## 4 Sepal.Width 3.057333333~ 0.435866284~ 2 3 4.4 150 100
# Frequency table of Species without cumulative columns or NA reporting,
# converted to a tibble with tb().
iris$Species %>%
  freq(cumul = FALSE, report.nas = FALSE) %>%
  tb()
## # A tibble: 3 x 3
## value freq pct
## <fct> <dbl> <dbl>
## 1 setosa 50 33.3
## 2 versicolor 50 33.3
## 3 virginica 50 33.3
# Cross-tabulate gender against smoker from the tobacco data, with a
# chi-squared test appended and the headings suppressed.
# %$% (magrittr) exposes the columns of tobacco to the call on its right.
tobacco %$%
  ctable(
    gender,
    smoker,
    chisq = TRUE,
    headings = FALSE
  )
##
## -------- -------- ------------- ------------- ---------------
## smoker Yes No Total
## gender
## F 147 (30.1%) 342 (69.9%) 489 (100.0%)
## M 143 (29.2%) 346 (70.8%) 489 (100.0%)
## <NA> 8 (36.4%) 14 (63.6%) 22 (100.0%)
## Total 298 (29.8%) 702 (70.2%) 1000 (100.0%)
## -------- -------- ------------- ------------- ---------------
##
## ----------------------------
## Chi.squared df p.value
## ------------- ---- ---------
## 0.5415 2 0.7628
## ----------------------------
# mlr demo: random-forest classifier on iris, evaluated on a single holdout
# split with the default performance measure.
data(iris)
task <- makeClassifTask(id = "iris", data = iris, target = "Species")
learner <- makeLearner(cl = "classif.randomForest")
holdout(learner = learner, task = task)
## Resample Result
## Task: iris
## Learner: classif.randomForest
## Aggr perf: mmce.test.mean=0.0800000
## Runtime: 0.03391
# Same holdout evaluation, reporting accuracy instead of the default measure.
# 90~96% accuracy - not bad!
holdout(learner = learner, task = task, measures = acc)
## Resample Result
## Task: iris
## Learner: classif.randomForest
## Aggr perf: acc.test.mean=0.9200000
## Runtime: 0.0289218
# parsnip demo: specify a linear regression, select the "lm" engine, and fit
# mpg against every other column of mtcars.
# NOTE: the result is stored as `fit`, the same name as the fit() generic;
# calls to fit() still resolve to the function because R skips non-function
# bindings when looking up a call.
data(mtcars)
fit <- linear_reg(mode = "regression") %>%
  set_engine(engine = "lm") %>%
  fit(formula = mpg ~ ., data = mtcars)
# Auto-print the fitted model object.
fit
## parsnip model object
##
##
## Call:
## stats::lm(formula = formula, data = data)
##
## Coefficients:
## (Intercept) cyl disp hp drat
## 12.30337 -0.11144 0.01334 -0.02148 0.78711
## wt qsec vs am gear
## -3.71530 0.82104 0.31776 2.52023 0.65541
## carb
## -0.19942
# ranger is a fast implementation of random forests (Breiman 2001), or
# recursive partitioning, particularly suited to high-dimensional data.
data(iris)
# Classification forest with 100 trees and mtry = 3.
ranger(
  Species ~ .,
  data = iris,
  num.trees = 100,
  mtry = 3
)
## Ranger result
##
## Call:
## ranger(Species ~ ., data = iris, num.trees = 100, mtry = 3)
##
## Type: Classification
## Number of trees: 100
## Sample size: 150
## Number of independent variables: 4
## Mtry: 3
## Target node size: 1
## Variable importance mode: none
## Splitrule: gini
## OOB prediction error: 5.33 %
# Randomly assign two thirds of the iris rows to the training set; the
# remaining rows form the test set.
train.idx <- sample.int(nrow(iris), size = 2 / 3 * nrow(iris))
iris.train <- iris[train.idx, , drop = FALSE]
iris.test <- iris[-train.idx, , drop = FALSE]
# Grow a forest on the training rows using ranger's default settings.
rg.iris <- ranger(Species ~ ., data = iris.train)
# Predict the species of the held-out rows.
pred.iris <- predict(object = rg.iris, data = iris.test)
# Confusion matrix: true species in rows, predicted species in columns.
table(iris.test[["Species"]], pred.iris[["predictions"]])
##
## setosa versicolor virginica
## setosa 15 0 0
## versicolor 0 21 0
## virginica 0 2 12
# Requires library(fueleconomy), which provides the vehicles data set.
# Keep only the first 100 rows as a small working sample.
data(vehicles)
df <- vehicles[seq_len(100), ]
# Inspect the structure (column names and types) of the sample.
str(df)
## Classes 'tbl_df', 'tbl' and 'data.frame': 100 obs. of 12 variables:
## $ id : int 27550 28426 27549 28425 1032 1033 3347 13309 13310 13311 ...
## $ make : chr "AM General" "AM General" "AM General" "AM General" ...
## $ model: chr "DJ Po Vehicle 2WD" "DJ Po Vehicle 2WD" "FJ8c Post Office" "FJ8c Post Office" ...
## $ year : int 1984 1984 1984 1984 1985 1985 1987 1997 1997 1997 ...
## $ class: chr "Special Purpose Vehicle 2WD" "Special Purpose Vehicle 2WD" "Special Purpose Vehicle 2WD" "Special Purpose Vehicle 2WD" ...
## $ trans: chr "Automatic 3-spd" "Automatic 3-spd" "Automatic 3-spd" "Automatic 3-spd" ...
## $ drive: chr "2-Wheel Drive" "2-Wheel Drive" "2-Wheel Drive" "2-Wheel Drive" ...
## $ cyl : int 4 4 6 6 4 6 6 4 4 6 ...
## $ displ: num 2.5 2.5 4.2 4.2 2.5 4.2 3.8 2.2 2.2 3 ...
## $ fuel : chr "Regular" "Regular" "Regular" "Regular" ...
## $ hwy : int 17 17 13 13 17 13 21 26 28 26 ...
## $ cty : int 18 18 13 13 16 13 14 20 22 18 ...
# Requires library(GGally): pairwise plot matrices with ggpairs().
# Integer columns of the vehicles sample.
quant_df <- df[c("cyl", "hwy", "cty")]
ggpairs(data = quant_df)
# Character columns of the vehicles sample.
cat_df <- df[c("fuel", "make", "drive")]
ggpairs(data = cat_df)
# ***