library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.3
data(mpg, package = "ggplot2")
dim(mpg)
## [1] 234  11
str(mpg)
## Classes 'tbl_df', 'tbl' and 'data.frame':    234 obs. of  11 variables:
##  $ manufacturer: chr  "audi" "audi" "audi" "audi" ...
##  $ model       : chr  "a4" "a4" "a4" "a4" ...
##  $ displ       : num  1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
##  $ year        : int  1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
##  $ cyl         : int  4 4 4 4 6 6 6 4 4 4 ...
##  $ trans       : chr  "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
##  $ drv         : chr  "f" "f" "f" "f" ...
##  $ cty         : int  18 21 20 21 16 18 18 18 16 20 ...
##  $ hwy         : int  29 29 31 30 26 26 27 26 25 28 ...
##  $ fl          : chr  "p" "p" "p" "p" ...
##  $ class       : chr  "compact" "compact" "compact" "compact" ...
knitr::kable(mpg)
manufacturer model displ year cyl trans drv cty hwy fl class
audi a4 1.8 1999 4 auto(l5) f 18 29 p compact
audi a4 1.8 1999 4 manual(m5) f 21 29 p compact
audi a4 2.0 2008 4 manual(m6) f 20 31 p compact
audi a4 2.0 2008 4 auto(av) f 21 30 p compact
audi a4 2.8 1999 6 auto(l5) f 16 26 p compact
audi a4 2.8 1999 6 manual(m5) f 18 26 p compact
audi a4 3.1 2008 6 auto(av) f 18 27 p compact
audi a4 quattro 1.8 1999 4 manual(m5) 4 18 26 p compact
audi a4 quattro 1.8 1999 4 auto(l5) 4 16 25 p compact
audi a4 quattro 2.0 2008 4 manual(m6) 4 20 28 p compact
audi a4 quattro 2.0 2008 4 auto(s6) 4 19 27 p compact
audi a4 quattro 2.8 1999 6 auto(l5) 4 15 25 p compact
audi a4 quattro 2.8 1999 6 manual(m5) 4 17 25 p compact
audi a4 quattro 3.1 2008 6 auto(s6) 4 17 25 p compact
audi a4 quattro 3.1 2008 6 manual(m6) 4 15 25 p compact
audi a6 quattro 2.8 1999 6 auto(l5) 4 15 24 p midsize
audi a6 quattro 3.1 2008 6 auto(s6) 4 17 25 p midsize
audi a6 quattro 4.2 2008 8 auto(s6) 4 16 23 p midsize
chevrolet c1500 suburban 2wd 5.3 2008 8 auto(l4) r 14 20 r suv
chevrolet c1500 suburban 2wd 5.3 2008 8 auto(l4) r 11 15 e suv
chevrolet c1500 suburban 2wd 5.3 2008 8 auto(l4) r 14 20 r suv
chevrolet c1500 suburban 2wd 5.7 1999 8 auto(l4) r 13 17 r suv
chevrolet c1500 suburban 2wd 6.0 2008 8 auto(l4) r 12 17 r suv
chevrolet corvette 5.7 1999 8 manual(m6) r 16 26 p 2seater
chevrolet corvette 5.7 1999 8 auto(l4) r 15 23 p 2seater
chevrolet corvette 6.2 2008 8 manual(m6) r 16 26 p 2seater
chevrolet corvette 6.2 2008 8 auto(s6) r 15 25 p 2seater
chevrolet corvette 7.0 2008 8 manual(m6) r 15 24 p 2seater
chevrolet k1500 tahoe 4wd 5.3 2008 8 auto(l4) 4 14 19 r suv
chevrolet k1500 tahoe 4wd 5.3 2008 8 auto(l4) 4 11 14 e suv
chevrolet k1500 tahoe 4wd 5.7 1999 8 auto(l4) 4 11 15 r suv
chevrolet k1500 tahoe 4wd 6.5 1999 8 auto(l4) 4 14 17 d suv
chevrolet malibu 2.4 1999 4 auto(l4) f 19 27 r midsize
chevrolet malibu 2.4 2008 4 auto(l4) f 22 30 r midsize
chevrolet malibu 3.1 1999 6 auto(l4) f 18 26 r midsize
chevrolet malibu 3.5 2008 6 auto(l4) f 18 29 r midsize
chevrolet malibu 3.6 2008 6 auto(s6) f 17 26 r midsize
dodge caravan 2wd 2.4 1999 4 auto(l3) f 18 24 r minivan
dodge caravan 2wd 3.0 1999 6 auto(l4) f 17 24 r minivan
dodge caravan 2wd 3.3 1999 6 auto(l4) f 16 22 r minivan
dodge caravan 2wd 3.3 1999 6 auto(l4) f 16 22 r minivan
dodge caravan 2wd 3.3 2008 6 auto(l4) f 17 24 r minivan
dodge caravan 2wd 3.3 2008 6 auto(l4) f 17 24 r minivan
dodge caravan 2wd 3.3 2008 6 auto(l4) f 11 17 e minivan
dodge caravan 2wd 3.8 1999 6 auto(l4) f 15 22 r minivan
dodge caravan 2wd 3.8 1999 6 auto(l4) f 15 21 r minivan
dodge caravan 2wd 3.8 2008 6 auto(l6) f 16 23 r minivan
dodge caravan 2wd 4.0 2008 6 auto(l6) f 16 23 r minivan
dodge dakota pickup 4wd 3.7 2008 6 manual(m6) 4 15 19 r pickup
dodge dakota pickup 4wd 3.7 2008 6 auto(l4) 4 14 18 r pickup
dodge dakota pickup 4wd 3.9 1999 6 auto(l4) 4 13 17 r pickup
dodge dakota pickup 4wd 3.9 1999 6 manual(m5) 4 14 17 r pickup
dodge dakota pickup 4wd 4.7 2008 8 auto(l5) 4 14 19 r pickup
dodge dakota pickup 4wd 4.7 2008 8 auto(l5) 4 14 19 r pickup
dodge dakota pickup 4wd 4.7 2008 8 auto(l5) 4 9 12 e pickup
dodge dakota pickup 4wd 5.2 1999 8 manual(m5) 4 11 17 r pickup
dodge dakota pickup 4wd 5.2 1999 8 auto(l4) 4 11 15 r pickup
dodge durango 4wd 3.9 1999 6 auto(l4) 4 13 17 r suv
dodge durango 4wd 4.7 2008 8 auto(l5) 4 13 17 r suv
dodge durango 4wd 4.7 2008 8 auto(l5) 4 9 12 e suv
dodge durango 4wd 4.7 2008 8 auto(l5) 4 13 17 r suv
dodge durango 4wd 5.2 1999 8 auto(l4) 4 11 16 r suv
dodge durango 4wd 5.7 2008 8 auto(l5) 4 13 18 r suv
dodge durango 4wd 5.9 1999 8 auto(l4) 4 11 15 r suv
dodge ram 1500 pickup 4wd 4.7 2008 8 manual(m6) 4 12 16 r pickup
dodge ram 1500 pickup 4wd 4.7 2008 8 auto(l5) 4 9 12 e pickup
dodge ram 1500 pickup 4wd 4.7 2008 8 auto(l5) 4 13 17 r pickup
dodge ram 1500 pickup 4wd 4.7 2008 8 auto(l5) 4 13 17 r pickup
dodge ram 1500 pickup 4wd 4.7 2008 8 manual(m6) 4 12 16 r pickup
dodge ram 1500 pickup 4wd 4.7 2008 8 manual(m6) 4 9 12 e pickup
dodge ram 1500 pickup 4wd 5.2 1999 8 auto(l4) 4 11 15 r pickup
dodge ram 1500 pickup 4wd 5.2 1999 8 manual(m5) 4 11 16 r pickup
dodge ram 1500 pickup 4wd 5.7 2008 8 auto(l5) 4 13 17 r pickup
dodge ram 1500 pickup 4wd 5.9 1999 8 auto(l4) 4 11 15 r pickup
ford expedition 2wd 4.6 1999 8 auto(l4) r 11 17 r suv
ford expedition 2wd 5.4 1999 8 auto(l4) r 11 17 r suv
ford expedition 2wd 5.4 2008 8 auto(l6) r 12 18 r suv
ford explorer 4wd 4.0 1999 6 auto(l5) 4 14 17 r suv
ford explorer 4wd 4.0 1999 6 manual(m5) 4 15 19 r suv
ford explorer 4wd 4.0 1999 6 auto(l5) 4 14 17 r suv
ford explorer 4wd 4.0 2008 6 auto(l5) 4 13 19 r suv
ford explorer 4wd 4.6 2008 8 auto(l6) 4 13 19 r suv
ford explorer 4wd 5.0 1999 8 auto(l4) 4 13 17 r suv
ford f150 pickup 4wd 4.2 1999 6 auto(l4) 4 14 17 r pickup
ford f150 pickup 4wd 4.2 1999 6 manual(m5) 4 14 17 r pickup
ford f150 pickup 4wd 4.6 1999 8 manual(m5) 4 13 16 r pickup
ford f150 pickup 4wd 4.6 1999 8 auto(l4) 4 13 16 r pickup
ford f150 pickup 4wd 4.6 2008 8 auto(l4) 4 13 17 r pickup
ford f150 pickup 4wd 5.4 1999 8 auto(l4) 4 11 15 r pickup
ford f150 pickup 4wd 5.4 2008 8 auto(l4) 4 13 17 r pickup
ford mustang 3.8 1999 6 manual(m5) r 18 26 r subcompact
ford mustang 3.8 1999 6 auto(l4) r 18 25 r subcompact
ford mustang 4.0 2008 6 manual(m5) r 17 26 r subcompact
ford mustang 4.0 2008 6 auto(l5) r 16 24 r subcompact
ford mustang 4.6 1999 8 auto(l4) r 15 21 r subcompact
ford mustang 4.6 1999 8 manual(m5) r 15 22 r subcompact
ford mustang 4.6 2008 8 manual(m5) r 15 23 r subcompact
ford mustang 4.6 2008 8 auto(l5) r 15 22 r subcompact
ford mustang 5.4 2008 8 manual(m6) r 14 20 p subcompact
honda civic 1.6 1999 4 manual(m5) f 28 33 r subcompact
honda civic 1.6 1999 4 auto(l4) f 24 32 r subcompact
honda civic 1.6 1999 4 manual(m5) f 25 32 r subcompact
honda civic 1.6 1999 4 manual(m5) f 23 29 p subcompact
honda civic 1.6 1999 4 auto(l4) f 24 32 r subcompact
honda civic 1.8 2008 4 manual(m5) f 26 34 r subcompact
honda civic 1.8 2008 4 auto(l5) f 25 36 r subcompact
honda civic 1.8 2008 4 auto(l5) f 24 36 c subcompact
honda civic 2.0 2008 4 manual(m6) f 21 29 p subcompact
hyundai sonata 2.4 1999 4 auto(l4) f 18 26 r midsize
hyundai sonata 2.4 1999 4 manual(m5) f 18 27 r midsize
hyundai sonata 2.4 2008 4 auto(l4) f 21 30 r midsize
hyundai sonata 2.4 2008 4 manual(m5) f 21 31 r midsize
hyundai sonata 2.5 1999 6 auto(l4) f 18 26 r midsize
hyundai sonata 2.5 1999 6 manual(m5) f 18 26 r midsize
hyundai sonata 3.3 2008 6 auto(l5) f 19 28 r midsize
hyundai tiburon 2.0 1999 4 auto(l4) f 19 26 r subcompact
hyundai tiburon 2.0 1999 4 manual(m5) f 19 29 r subcompact
hyundai tiburon 2.0 2008 4 manual(m5) f 20 28 r subcompact
hyundai tiburon 2.0 2008 4 auto(l4) f 20 27 r subcompact
hyundai tiburon 2.7 2008 6 auto(l4) f 17 24 r subcompact
hyundai tiburon 2.7 2008 6 manual(m6) f 16 24 r subcompact
hyundai tiburon 2.7 2008 6 manual(m5) f 17 24 r subcompact
jeep grand cherokee 4wd 3.0 2008 6 auto(l5) 4 17 22 d suv
jeep grand cherokee 4wd 3.7 2008 6 auto(l5) 4 15 19 r suv
jeep grand cherokee 4wd 4.0 1999 6 auto(l4) 4 15 20 r suv
jeep grand cherokee 4wd 4.7 1999 8 auto(l4) 4 14 17 r suv
jeep grand cherokee 4wd 4.7 2008 8 auto(l5) 4 9 12 e suv
jeep grand cherokee 4wd 4.7 2008 8 auto(l5) 4 14 19 r suv
jeep grand cherokee 4wd 5.7 2008 8 auto(l5) 4 13 18 r suv
jeep grand cherokee 4wd 6.1 2008 8 auto(l5) 4 11 14 p suv
land rover range rover 4.0 1999 8 auto(l4) 4 11 15 p suv
land rover range rover 4.2 2008 8 auto(s6) 4 12 18 r suv
land rover range rover 4.4 2008 8 auto(s6) 4 12 18 r suv
land rover range rover 4.6 1999 8 auto(l4) 4 11 15 p suv
lincoln navigator 2wd 5.4 1999 8 auto(l4) r 11 17 r suv
lincoln navigator 2wd 5.4 1999 8 auto(l4) r 11 16 p suv
lincoln navigator 2wd 5.4 2008 8 auto(l6) r 12 18 r suv
mercury mountaineer 4wd 4.0 1999 6 auto(l5) 4 14 17 r suv
mercury mountaineer 4wd 4.0 2008 6 auto(l5) 4 13 19 r suv
mercury mountaineer 4wd 4.6 2008 8 auto(l6) 4 13 19 r suv
mercury mountaineer 4wd 5.0 1999 8 auto(l4) 4 13 17 r suv
nissan altima 2.4 1999 4 manual(m5) f 21 29 r compact
nissan altima 2.4 1999 4 auto(l4) f 19 27 r compact
nissan altima 2.5 2008 4 auto(av) f 23 31 r midsize
nissan altima 2.5 2008 4 manual(m6) f 23 32 r midsize
nissan altima 3.5 2008 6 manual(m6) f 19 27 p midsize
nissan altima 3.5 2008 6 auto(av) f 19 26 p midsize
nissan maxima 3.0 1999 6 auto(l4) f 18 26 r midsize
nissan maxima 3.0 1999 6 manual(m5) f 19 25 r midsize
nissan maxima 3.5 2008 6 auto(av) f 19 25 p midsize
nissan pathfinder 4wd 3.3 1999 6 auto(l4) 4 14 17 r suv
nissan pathfinder 4wd 3.3 1999 6 manual(m5) 4 15 17 r suv
nissan pathfinder 4wd 4.0 2008 6 auto(l5) 4 14 20 p suv
nissan pathfinder 4wd 5.6 2008 8 auto(s5) 4 12 18 p suv
pontiac grand prix 3.1 1999 6 auto(l4) f 18 26 r midsize
pontiac grand prix 3.8 1999 6 auto(l4) f 16 26 p midsize
pontiac grand prix 3.8 1999 6 auto(l4) f 17 27 r midsize
pontiac grand prix 3.8 2008 6 auto(l4) f 18 28 r midsize
pontiac grand prix 5.3 2008 8 auto(s4) f 16 25 p midsize
subaru forester awd 2.5 1999 4 manual(m5) 4 18 25 r suv
subaru forester awd 2.5 1999 4 auto(l4) 4 18 24 r suv
subaru forester awd 2.5 2008 4 manual(m5) 4 20 27 r suv
subaru forester awd 2.5 2008 4 manual(m5) 4 19 25 p suv
subaru forester awd 2.5 2008 4 auto(l4) 4 20 26 r suv
subaru forester awd 2.5 2008 4 auto(l4) 4 18 23 p suv
subaru impreza awd 2.2 1999 4 auto(l4) 4 21 26 r subcompact
subaru impreza awd 2.2 1999 4 manual(m5) 4 19 26 r subcompact
subaru impreza awd 2.5 1999 4 manual(m5) 4 19 26 r subcompact
subaru impreza awd 2.5 1999 4 auto(l4) 4 19 26 r subcompact
subaru impreza awd 2.5 2008 4 auto(s4) 4 20 25 p compact
subaru impreza awd 2.5 2008 4 auto(s4) 4 20 27 r compact
subaru impreza awd 2.5 2008 4 manual(m5) 4 19 25 p compact
subaru impreza awd 2.5 2008 4 manual(m5) 4 20 27 r compact
toyota 4runner 4wd 2.7 1999 4 manual(m5) 4 15 20 r suv
toyota 4runner 4wd 2.7 1999 4 auto(l4) 4 16 20 r suv
toyota 4runner 4wd 3.4 1999 6 auto(l4) 4 15 19 r suv
toyota 4runner 4wd 3.4 1999 6 manual(m5) 4 15 17 r suv
toyota 4runner 4wd 4.0 2008 6 auto(l5) 4 16 20 r suv
toyota 4runner 4wd 4.7 2008 8 auto(l5) 4 14 17 r suv
toyota camry 2.2 1999 4 manual(m5) f 21 29 r midsize
toyota camry 2.2 1999 4 auto(l4) f 21 27 r midsize
toyota camry 2.4 2008 4 manual(m5) f 21 31 r midsize
toyota camry 2.4 2008 4 auto(l5) f 21 31 r midsize
toyota camry 3.0 1999 6 auto(l4) f 18 26 r midsize
toyota camry 3.0 1999 6 manual(m5) f 18 26 r midsize
toyota camry 3.5 2008 6 auto(s6) f 19 28 r midsize
toyota camry solara 2.2 1999 4 auto(l4) f 21 27 r compact
toyota camry solara 2.2 1999 4 manual(m5) f 21 29 r compact
toyota camry solara 2.4 2008 4 manual(m5) f 21 31 r compact
toyota camry solara 2.4 2008 4 auto(s5) f 22 31 r compact
toyota camry solara 3.0 1999 6 auto(l4) f 18 26 r compact
toyota camry solara 3.0 1999 6 manual(m5) f 18 26 r compact
toyota camry solara 3.3 2008 6 auto(s5) f 18 27 r compact
toyota corolla 1.8 1999 4 auto(l3) f 24 30 r compact
toyota corolla 1.8 1999 4 auto(l4) f 24 33 r compact
toyota corolla 1.8 1999 4 manual(m5) f 26 35 r compact
toyota corolla 1.8 2008 4 manual(m5) f 28 37 r compact
toyota corolla 1.8 2008 4 auto(l4) f 26 35 r compact
toyota land cruiser wagon 4wd 4.7 1999 8 auto(l4) 4 11 15 r suv
toyota land cruiser wagon 4wd 5.7 2008 8 auto(s6) 4 13 18 r suv
toyota toyota tacoma 4wd 2.7 1999 4 manual(m5) 4 15 20 r pickup
toyota toyota tacoma 4wd 2.7 1999 4 auto(l4) 4 16 20 r pickup
toyota toyota tacoma 4wd 2.7 2008 4 manual(m5) 4 17 22 r pickup
toyota toyota tacoma 4wd 3.4 1999 6 manual(m5) 4 15 17 r pickup
toyota toyota tacoma 4wd 3.4 1999 6 auto(l4) 4 15 19 r pickup
toyota toyota tacoma 4wd 4.0 2008 6 manual(m6) 4 15 18 r pickup
toyota toyota tacoma 4wd 4.0 2008 6 auto(l5) 4 16 20 r pickup
volkswagen gti 2.0 1999 4 manual(m5) f 21 29 r compact
volkswagen gti 2.0 1999 4 auto(l4) f 19 26 r compact
volkswagen gti 2.0 2008 4 manual(m6) f 21 29 p compact
volkswagen gti 2.0 2008 4 auto(s6) f 22 29 p compact
volkswagen gti 2.8 1999 6 manual(m5) f 17 24 r compact
volkswagen jetta 1.9 1999 4 manual(m5) f 33 44 d compact
volkswagen jetta 2.0 1999 4 manual(m5) f 21 29 r compact
volkswagen jetta 2.0 1999 4 auto(l4) f 19 26 r compact
volkswagen jetta 2.0 2008 4 auto(s6) f 22 29 p compact
volkswagen jetta 2.0 2008 4 manual(m6) f 21 29 p compact
volkswagen jetta 2.5 2008 5 auto(s6) f 21 29 r compact
volkswagen jetta 2.5 2008 5 manual(m5) f 21 29 r compact
volkswagen jetta 2.8 1999 6 auto(l4) f 16 23 r compact
volkswagen jetta 2.8 1999 6 manual(m5) f 17 24 r compact
volkswagen new beetle 1.9 1999 4 manual(m5) f 35 44 d subcompact
volkswagen new beetle 1.9 1999 4 auto(l4) f 29 41 d subcompact
volkswagen new beetle 2.0 1999 4 manual(m5) f 21 29 r subcompact
volkswagen new beetle 2.0 1999 4 auto(l4) f 19 26 r subcompact
volkswagen new beetle 2.5 2008 5 manual(m5) f 20 28 r subcompact
volkswagen new beetle 2.5 2008 5 auto(s6) f 20 29 r subcompact
volkswagen passat 1.8 1999 4 manual(m5) f 21 29 p midsize
volkswagen passat 1.8 1999 4 auto(l5) f 18 29 p midsize
volkswagen passat 2.0 2008 4 auto(s6) f 19 28 p midsize
volkswagen passat 2.0 2008 4 manual(m6) f 21 29 p midsize
volkswagen passat 2.8 1999 6 auto(l5) f 16 26 p midsize
volkswagen passat 2.8 1999 6 manual(m5) f 18 26 p midsize
volkswagen passat 3.6 2008 6 auto(s6) f 17 26 p midsize
# Flag every character column, then recode each flagged column as a factor.
char_var <- vapply(mpg, is.character, logical(1))
mpg[char_var] <- lapply(mpg[char_var], as.factor)
str(mpg)
## Classes 'tbl_df', 'tbl' and 'data.frame':    234 obs. of  11 variables:
##  $ manufacturer: Factor w/ 15 levels "audi","chevrolet",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ model       : Factor w/ 38 levels "4runner 4wd",..: 2 2 2 2 2 2 2 3 3 3 ...
##  $ displ       : num  1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
##  $ year        : int  1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
##  $ cyl         : int  4 4 4 4 6 6 6 4 4 4 ...
##  $ trans       : Factor w/ 10 levels "auto(av)","auto(l3)",..: 4 9 10 1 4 9 1 9 4 10 ...
##  $ drv         : Factor w/ 3 levels "4","f","r": 2 2 2 2 2 2 2 1 1 1 ...
##  $ cty         : int  18 21 20 21 16 18 18 18 16 20 ...
##  $ hwy         : int  29 29 31 30 26 26 27 26 25 28 ...
##  $ fl          : Factor w/ 5 levels "c","d","e","p",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ class       : Factor w/ 7 levels "2seater","compact",..: 2 2 2 2 2 2 2 2 2 2 ...

Discretization

library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(arules)
## Warning: package 'arules' was built under R version 3.5.3
## Loading required package: Matrix
## 
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
# Bin engine displacement into three equal-frequency groups.
mpg$displ_grp <- discretize(
  mpg$displ,
  method = "frequency",
  breaks = 3,
  labels = c("low", "medium", "high")
)
# Per-group summary: mean, size and range of displacement.
mpg %>%
  group_by(displ_grp) %>%
  summarise(
    avg_displ = mean(displ),
    count = n(),
    min = min(displ),
    max = max(displ)
  )
## # A tibble: 3 x 5
##   displ_grp avg_displ count   min   max
##   <fct>         <dbl> <int> <dbl> <dbl>
## 1 low            2.02    62   1.6   2.4
## 2 medium         3.06    86   2.5   3.9
## 3 high           4.93    86   4     7

Data Normalization and Standardization:

# Standardize displ to z-scores (mean 0, sd 1). Note that scale() returns a
# one-column matrix rather than a plain vector, which is why summary() below
# labels the column "V1" and the tibble prints it as displ_scale[,1].
mpg$displ_scale <- scale(mpg$displ, center = TRUE, scale = TRUE)
summary(mpg$displ)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.600   2.400   3.300   3.472   4.600   7.000
sd(mpg$displ)
## [1] 1.291959
summary(mpg$displ_scale)
##        V1         
##  Min.   :-1.4488  
##  1st Qu.:-0.8296  
##  Median :-0.1330  
##  Mean   : 0.0000  
##  3rd Qu.: 0.8733  
##  Max.   : 2.7309
sd(mpg$displ_scale)
## [1] 1

Dealing with Data Outliers and Noise (1):

Before removing outliers:

# Distribution of hwy before any outlier treatment.
boxplot(mpg$hwy)

# Overwrite every value flagged as an outlier by boxplot.stats() with the
# median of hwy, then redraw the box plot.
mpg$hwy[mpg$hwy %in% boxplot.stats(mpg$hwy)$out] <- median(
  mpg$hwy,
  na.rm = TRUE
)
boxplot(mpg$hwy)

Dealing with Data Outliers and Noise (2):

After removing outliers:

# Repeat the median replacement: values that boxplot.stats() flags on the
# already-treated data are overwritten as well, then the plot is redrawn.
mpg$hwy[mpg$hwy %in% boxplot.stats(mpg$hwy)$out] <- median(
  mpg$hwy,
  na.rm = TRUE
)
boxplot(mpg$hwy)

Remove Duplicate Data Record:

nrow(mpg)
## [1] 234
# Number of rows left once repeated records are dropped.
sum(!duplicated(mpg))
## [1] 225
# Same count via dplyr: keep one copy of every distinct row.
mpg %>%
  distinct(.keep_all = TRUE) %>%
  nrow()
## [1] 225
# Show every row that has a twin: scanning from both ends marks the first
# occurrence as well as the later repeats.
mpg[duplicated(mpg, fromLast = TRUE) | duplicated(mpg), ]
## # A tibble: 18 x 13
##    manufacturer model displ  year   cyl trans drv     cty   hwy fl    class
##    <fct>        <fct> <dbl> <int> <int> <fct> <fct> <int> <dbl> <fct> <fct>
##  1 chevrolet    c150~   5.3  2008     8 auto~ r        14    20 r     suv  
##  2 chevrolet    c150~   5.3  2008     8 auto~ r        14    20 r     suv  
##  3 dodge        cara~   3.3  1999     6 auto~ f        16    22 r     mini~
##  4 dodge        cara~   3.3  1999     6 auto~ f        16    22 r     mini~
##  5 dodge        cara~   3.3  2008     6 auto~ f        17    24 r     mini~
##  6 dodge        cara~   3.3  2008     6 auto~ f        17    24 r     mini~
##  7 dodge        dako~   4.7  2008     8 auto~ 4        14    19 r     pick~
##  8 dodge        dako~   4.7  2008     8 auto~ 4        14    19 r     pick~
##  9 dodge        dura~   4.7  2008     8 auto~ 4        13    17 r     suv  
## 10 dodge        dura~   4.7  2008     8 auto~ 4        13    17 r     suv  
## 11 dodge        ram ~   4.7  2008     8 manu~ 4        12    16 r     pick~
## 12 dodge        ram ~   4.7  2008     8 auto~ 4        13    17 r     pick~
## 13 dodge        ram ~   4.7  2008     8 auto~ 4        13    17 r     pick~
## 14 dodge        ram ~   4.7  2008     8 manu~ 4        12    16 r     pick~
## 15 ford         expl~   4    1999     6 auto~ 4        14    17 r     suv  
## 16 ford         expl~   4    1999     6 auto~ 4        14    17 r     suv  
## 17 honda        civic   1.6  1999     4 auto~ f        24    32 r     subc~
## 18 honda        civic   1.6  1999     4 auto~ f        24    32 r     subc~
## # ... with 2 more variables: displ_grp <fct>, displ_scale[,1] <dbl>

Aggregation:

Goal: output the ten most fuel-efficient manufacturers, ranked by average city mileage:

 # Average city mileage per manufacturer, best first; show the top ten.
 mpg %>%
  group_by(manufacturer) %>%
  summarise(avg_cty = mean(cty)) %>%
  arrange(desc(avg_cty)) %>%
  head(n = 10)
## # A tibble: 10 x 2
##    manufacturer avg_cty
##    <fct>          <dbl>
##  1 honda           24.4
##  2 volkswagen      20.9
##  3 subaru          19.3
##  4 hyundai         18.6
##  5 toyota          18.5
##  6 nissan          18.1
##  7 audi            17.6
##  8 pontiac         17  
##  9 chevrolet       15  
## 10 ford            14

Random Data Sampling without Replacement:

# Draw 30% of the row indices at random, without replacement.
mpg_sample_index <- sample(
  seq_len(nrow(mpg)),
  size = nrow(mpg) * 0.3,
  replace = FALSE
)
# Share of each drive type within the sample.
prop.table(table(mpg[mpg_sample_index, "drv"]))
## 
##         4         f         r 
## 0.4000000 0.4857143 0.1142857
mpg[mpg_sample_index, ]
## # A tibble: 70 x 13
##    manufacturer model displ  year   cyl trans drv     cty   hwy fl    class
##    <fct>        <fct> <dbl> <int> <int> <fct> <fct> <int> <dbl> <fct> <fct>
##  1 subaru       fore~   2.5  2008     4 manu~ 4        19    25 p     suv  
##  2 volkswagen   jetta   2    1999     4 manu~ f        21    29 r     comp~
##  3 ford         must~   4    2008     6 auto~ r        16    24 r     subc~
##  4 lincoln      navi~   5.4  1999     8 auto~ r        11    17 r     suv  
##  5 subaru       impr~   2.5  1999     4 auto~ 4        19    26 r     subc~
##  6 honda        civic   2    2008     4 manu~ f        21    29 p     subc~
##  7 chevrolet    corv~   7    2008     8 manu~ r        15    24 p     2sea~
##  8 dodge        dako~   4.7  2008     8 auto~ 4         9    12 e     pick~
##  9 volkswagen   gti     2    2008     4 manu~ f        21    29 p     comp~
## 10 chevrolet    corv~   5.7  1999     8 auto~ r        15    23 p     2sea~
## # ... with 60 more rows, and 2 more variables: displ_grp <fct>,
## #   displ_scale[,1] <dbl>

Stratified Data Sampling:

library(caret)
## Warning: package 'caret' was built under R version 3.5.3
## Loading required package: lattice
prop.table(table(mpg$drv))
## 
##         4         f         r 
## 0.4401709 0.4529915 0.1068376

Missing Value Manipulations (1):

Check whether the dataset contains missing values. If it does not, randomly set some values to missing, then count how many values are missing and in which variables.

sum(!complete.cases(mpg))
## [1] 0
# Blank out five randomly chosen hwy values to create missing data.
mpg$hwy[sample(seq_along(mpg$hwy), size = 5, replace = FALSE)] <- NA
sum(!complete.cases(mpg))
## [1] 5
# Number of missing values in each column.
vapply(mpg, function(column) sum(is.na(column)), integer(1))
## manufacturer        model        displ         year          cyl 
##            0            0            0            0            0 
##        trans          drv          cty          hwy           fl 
##            0            0            0            5            0 
##        class    displ_grp  displ_scale 
##            0            0            0

Missing Value Manipulations (2)

Simple imputation method for missing value: replace with mean of the attribute:

mpg2 <- mpg
sum(!complete.cases(mpg2))
## [1] 5
# Fill each missing hwy value with the mean of the observed hwy values.
mpg2$hwy <- replace(mpg2$hwy, is.na(mpg2$hwy), mean(mpg2$hwy, na.rm = TRUE))
sum(!complete.cases(mpg2))
## [1] 0

More advanced missing value imputation algorithms: kNN imputation:

library(caret)
# Fit a caret pre-processing step on mpg that requests k-nearest-neighbour
# imputation together with centering and scaling, then apply it to mpg.
# NOTE(review): because "center" and "scale" are requested, mpg3 presumably
# holds standardized values rather than the original units - confirm this is
# intended before using mpg3 downstream.
preprocess <- preProcess(mpg, method = c("knnImpute", "center", "scale"))
mpg3 <- predict(preprocess, mpg)
sum(!complete.cases(mpg3))
## [1] 0

Visualization: Box Plot:

theme_set(theme_bw())

# Highway mileage by displacement group; vertical grid lines are dropped
# because the x axis is categorical.
ggplot(mpg, aes(displ_grp, hwy)) +
  geom_boxplot() +
  theme(panel.grid.major.x = element_blank())
## Warning: Removed 5 rows containing non-finite values (stat_boxplot).

Visualization: Scatter Plot (and its Variants):

# City vs. highway mileage: point colour encodes cylinder count, point size
# encodes displacement, with a linear fit drawn on top.
ggplot(mpg, aes(hwy, cty)) +
  geom_point(aes(colour = as.factor(cyl), size = displ)) +
  geom_smooth(method = "lm")
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## Warning: Removed 5 rows containing missing values (geom_point).

Visualization: Pie Chart:

library(scales)
## Warning: package 'scales' was built under R version 3.5.3
# Pie chart of the share of cars in each displacement group.
# The share is kept numeric so that slice angles are proportional to it;
# percent() is applied only to the text labels. Mapping the formatted
# percent strings to y would place the slices on a discrete scale and
# distort their sizes.
mpg %>%
  group_by(displ_grp) %>%
  summarise(pct = n() / nrow(mpg)) %>%
  ggplot(aes(x = factor(1), y = pct, fill = displ_grp)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = percent(pct)),
            position = position_stack(vjust = 0.5)) +
  coord_polar(theta = "y") +
  theme(axis.ticks = element_blank(),
        axis.text = element_blank(),
        axis.title = element_blank())

Visualization: Ordered Bar Plot:

# Mean city mileage per manufacturer, sorted from best to worst.
mpg_by_maker <- mpg %>%
  group_by(manufacturer) %>%
  summarise(avg_cty = mean(cty, na.rm = TRUE)) %>%
  arrange(desc(avg_cty))
# Re-level the factor in the sorted order so the bars are drawn in that order
# instead of alphabetically.
mpg_by_maker$manufacturer <- factor(
  mpg_by_maker$manufacturer,
  levels = mpg_by_maker$manufacturer
)
ggplot(mpg_by_maker, aes(manufacturer, avg_cty)) +
  geom_col() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid.major.x = element_blank()
  )

Visualization: Density Plot:

# Density of city mileage, one curve per cylinder count.
# The fills are semi-transparent so that overlapping curves stay visible
# instead of the last-drawn group hiding the others.
ggplot(mpg, aes(cty)) +
  geom_density(aes(fill = factor(cyl)), alpha = 0.8) +
  labs(title = "Density plot",
       subtitle = "City Mileage Grouped by Number of cylinders",
       caption = "Source: mpg",
       x = "City Mileage",
       fill = "# Cylinders")

Visualization: Heatmap:

# Heatmap of mean highway mileage by model year and displacement group.
# The data are aggregated to one row per (year, displ_grp) cell first;
# drawing the raw rows would stack many tiles on each cell and show only
# the colour of whichever row happened to be drawn last.
mpg %>%
  group_by(year, displ_grp) %>%
  summarise(avg_hwy = mean(hwy, na.rm = TRUE)) %>%
  ungroup() %>%
  ggplot(aes(x = as.factor(year), y = displ_grp)) +
  geom_tile(aes(fill = avg_hwy), color = "white") +
  scale_fill_gradient(low = "red", high = "green") +
  theme(axis.ticks = element_blank()) +
  labs(x = "Year", y = "Engine Power", fill = "Mean hwy")

Visualization: Faceting:

# City vs. highway mileage in a grid of panels: one row per model year and
# one column per displacement group.
ggplot(mpg, aes(x = cty, y = hwy)) +
  geom_point() +
  facet_grid(rows = vars(year), cols = vars(displ_grp))
## Warning: Removed 5 rows containing missing values (geom_point).