Data visualization

What does the dataset loo like?

data("iris")
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
head(iris, 5)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
data("mtcars")
str(mtcars)
## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...
head(mtcars,5)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2

Study one numeric variable.

hist(iris$Sepal.Length, freq = F)
lines(density(iris$Sepal.Length), col = "red")

bwplot(mtcars$am ~ mtcars$mpg)

Study one categorical variable.

pie(table(mtcars$cyl))

Plot two vriables.

with(iris, plot(Sepal.Length, Sepal.Width))

xyplot(Sepal.Length ~ Sepal.Width | Species, data = iris)

How about three variables?

ggplot(iris, aes(Sepal.Length, Sepal.Width, color = Species)) +
  geom_point()

Let’s plot four variables.

ggplot(mtcars, aes(mpg, wt, color = gear)) +
  geom_point() +
  facet_grid(~am)

Want to plot the whole dataset.

scatterplotMatrix(mtcars[1:6], spread = F, lty.smooth = 2, 
                  main = "Scatter plot matirx")

Data wrangling

Change the wide table to the long table (such as 4 variables to 2 variables).

ldf <- df %>% gather(variable, value, -Species, -Key)
head(ldf,5)
##   Species Key     variable value
## 1  setosa   1 Sepal.Length   5.1
## 2  setosa   2 Sepal.Length   4.9
## 3  setosa   3 Sepal.Length   4.7
## 4  setosa   4 Sepal.Length   4.6
## 5  setosa   5 Sepal.Length   5.0

Change the long table back to the wide table.

sdf <- ldf %>% spread(variable, value)
head(sdf,5)
##   Species Key Petal.Length Petal.Width Sepal.Length Sepal.Width
## 1  setosa   1          1.4         0.2          5.1         3.5
## 2  setosa   2          1.4         0.2          4.9         3.0
## 3  setosa   3          1.3         0.2          4.7         3.2
## 4  setosa   4          1.5         0.2          4.6         3.1
## 5  setosa   5          1.4         0.2          5.0         3.6

Seperate one cloumn into two columns.

sedf <- df %>% separate(Sepal.Length, c("Var1", "Var2"))
head(sedf,5)
##   Var1 Var2 Sepal.Width Petal.Length Petal.Width Species Key
## 1    5    1         3.5          1.4         0.2  setosa   1
## 2    4    9         3.0          1.4         0.2  setosa   2
## 3    4    7         3.2          1.3         0.2  setosa   3
## 4    4    6         3.1          1.5         0.2  setosa   4
## 5    5 <NA>         3.6          1.4         0.2  setosa   5

Unite two columns into one column.

udf <- sedf %>% unite(Sepal.Length, Var1, Var2, sep = ".")
head(udf,5)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species Key
## 1          5.1         3.5          1.4         0.2  setosa   1
## 2          4.9         3.0          1.4         0.2  setosa   2
## 3          4.7         3.2          1.3         0.2  setosa   3
## 4          4.6         3.1          1.5         0.2  setosa   4
## 5         5.NA         3.6          1.4         0.2  setosa   5

Change the format of variable names.

string <- c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width" )
gsub(".", "-", string, fixed = TRUE)
## [1] "Sepal-Length" "Sepal-Width"  "Petal-Length" "Petal-Width"

Split one variables (such as 4 variables to 3 variables).

output <- iris %>% 
  mutate(Key=seq_len(n())) %>%                       
  gather(flower_part, Value, -Species,-Key) %>%      
  separate(flower_part,into=c("Part","Measure")) %>% 
  spread(Measure, Value)
head(output, 5)
##   Species Key  Part Length Width
## 1  setosa   1 Petal    1.4   0.2
## 2  setosa   1 Sepal    5.1   3.5
## 3  setosa   2 Petal    1.4   0.2
## 4  setosa   2 Sepal    4.9   3.0
## 5  setosa   3 Petal    1.3   0.2
output %>% ggplot(aes(x=Width, y=Length,col=Species, shape=Species)) +
  facet_grid(~Part) + geom_point()

Exchange variables of the dataset.I am sure there is a better way to do it.

con1 <- iris %>% 
  filter(Species == "setosa") %>% 
  select(-Species) %>% 
  gather(variables, setosa)
  
con2 <- iris %>% 
  filter(Species == "versicolor") %>% 
  select(-Species) %>% 
  gather(variables, versicolor)

con3 <- iris %>% 
  filter(Species == "virginica") %>% 
  select(-Species) %>% 
  gather(variables, virginica)

con4 <- cbind(con1, con2[2], con3[2])
head(con4, 5)
##      variables setosa versicolor virginica
## 1 Sepal.Length    5.1        7.0       6.3
## 2 Sepal.Length    4.9        6.4       5.8
## 3 Sepal.Length    4.7        6.9       7.1
## 4 Sepal.Length    4.6        5.5       6.3
## 5 Sepal.Length    5.0        6.5       6.5

Add new variables to the dataset.

dfdate <- mtcars %>% 
  select(1,2,4,9,10) %>% 
  mutate(Var1 = mpg/hp, 
         time = ymd(today()))
head(dfdate, 5)
##    mpg cyl  hp am gear      Var1       time
## 1 21.0   6 110  1    4 0.1909091 2018-11-20
## 2 21.0   6 110  1    4 0.1909091 2018-11-20
## 3 22.8   4  93  1    4 0.2451613 2018-11-20
## 4 21.4   6 110  0    3 0.1945455 2018-11-20
## 5 18.7   8 175  0    3 0.1068571 2018-11-20

Manipulate the formate of date.

dfdate$wday <- wday(dfdate$time, label = TRUE)
head(dfdate, 5)
##    mpg cyl  hp am gear      Var1       time wday
## 1 21.0   6 110  1    4 0.1909091 2018-11-20  Tue
## 2 21.0   6 110  1    4 0.1909091 2018-11-20  Tue
## 3 22.8   4  93  1    4 0.2451613 2018-11-20  Tue
## 4 21.4   6 110  0    3 0.1945455 2018-11-20  Tue
## 5 18.7   8 175  0    3 0.1068571 2018-11-20  Tue

Factorize categorical variables.

dfdate$am.lbl <- factor(dfdate$am, labels = c("Automatic", "Manual"))
head(dfdate, 5)
##    mpg cyl  hp am gear      Var1       time wday    am.lbl
## 1 21.0   6 110  1    4 0.1909091 2018-11-20  Tue    Manual
## 2 21.0   6 110  1    4 0.1909091 2018-11-20  Tue    Manual
## 3 22.8   4  93  1    4 0.2451613 2018-11-20  Tue    Manual
## 4 21.4   6 110  0    3 0.1945455 2018-11-20  Tue Automatic
## 5 18.7   8 175  0    3 0.1068571 2018-11-20  Tue Automatic
aggregate(mpg~am.lbl, data = dfdate, FUN = mean)
##      am.lbl      mpg
## 1 Automatic 17.14737
## 2    Manual 24.39231

Export the dateset in csv.

final <- mtcars %>% 
  select(1,2,4,9,10) %>% 
  group_by(am, gear) %>% 
  summarise(Mean = mean(hp),
                  SD = sd(hp)) %>% 
  arrange(-Mean)
head(final,5)
## # A tibble: 4 x 4
## # Groups:   am [2]
##      am  gear  Mean    SD
##   <dbl> <dbl> <dbl> <dbl>
## 1    1.    5. 196.  103. 
## 2    0.    3. 176.   47.7
## 3    0.    4. 101.   29.0
## 4    1.    4.  83.9  24.2
write.csv(final, file = "final.csv",row.names=FALSE)

Subset

Slice a part of dataset.

mtcars %>% 
  slice(1:5)
##    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## 1 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## 2 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## 3 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## 4 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## 5 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2

Filter across the whole dataset.

mtcars %>% 
  filter_all(any_vars(. > 150)) %>% 
  head(5)
##    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## 1 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## 2 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## 3 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## 4 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## 5 18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

Filter based on required conditions.

mtcars %>% 
  filter(cyl == 6, mpg >= 20)
##    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## 1 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## 2 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## 3 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
mtcars %>% 
  filter(cyl %in% c(4, 6)) %>% 
  head(5)
##    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## 1 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## 2 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## 3 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## 4 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## 5 18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

Filter using regex.

mtcars %>% 
  rownames_to_column %>% 
  filter(str_detect(rowname, "Merc|Toy"))
##          rowname  mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## 1      Merc 240D 24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## 2       Merc 230 22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## 3       Merc 280 19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## 4      Merc 280C 17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## 5     Merc 450SE 16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
## 6     Merc 450SL 17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
## 7    Merc 450SLC 15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
## 8 Toyota Corolla 33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## 9  Toyota Corona 21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
mtcars %>% 
  rownames_to_column %>% 
  filter(str_detect(rowname, "^L")) 
##               rowname  mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## 1 Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
## 2        Lotus Europa 30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
mtcars %>% 
  rownames_to_column %>% 
  filter(str_detect(rowname, "L$")) 
##          rowname  mpg cyl  disp  hp drat   wt qsec vs am gear carb
## 1     Merc 450SL 17.3   8 275.8 180 3.07 3.73 17.6  0  0    3    3
## 2 Ford Pantera L 15.8   8 351.0 264 4.22 3.17 14.5  0  1    5    4
mtcars %>% 
  rownames_to_column %>% 
  filter(str_detect(rowname, "\\d")) 
##           rowname  mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## 1       Mazda RX4 21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## 2   Mazda RX4 Wag 21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## 3      Datsun 710 22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## 4  Hornet 4 Drive 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## 5      Duster 360 14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## 6       Merc 240D 24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## 7        Merc 230 22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## 8        Merc 280 19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## 9       Merc 280C 17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## 10     Merc 450SE 16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
## 11     Merc 450SL 17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
## 12    Merc 450SLC 15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
## 13       Fiat 128 32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## 14     Camaro Z28 13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
## 15      Fiat X1-9 27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## 16  Porsche 914-2 26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
## 17     Volvo 142E 21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2
mtcars %>% 
  rownames_to_column %>% 
  filter(!str_detect(rowname, "\\d")) 
##                rowname  mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## 1    Hornet Sportabout 18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## 2              Valiant 18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## 3   Cadillac Fleetwood 10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
## 4  Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
## 5    Chrysler Imperial 14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
## 6          Honda Civic 30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## 7       Toyota Corolla 33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## 8        Toyota Corona 21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## 9     Dodge Challenger 15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
## 10         AMC Javelin 15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
## 11    Pontiac Firebird 19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
## 12        Lotus Europa 30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
## 13      Ford Pantera L 15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
## 14        Ferrari Dino 19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
## 15       Maserati Bora 15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8

Highlight the subset when plotting.

ggplot(mtcars, aes(wt, mpg, colour = cyl, use_group_by=FALSE)) +
  geom_point() +
  gghighlight(disp > 460,  label_key = am) + 
  scale_x_discrete(name ="wt")

Statistical analysis

R has a very rich statistical library.

model1 <- lm(mpg~., data = mtcars)
summary(model1)
## 
## Call:
## lm(formula = mpg ~ ., data = mtcars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4506 -1.6044 -0.1196  1.2193  4.6271 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 12.30337   18.71788   0.657   0.5181  
## cyl         -0.11144    1.04502  -0.107   0.9161  
## disp         0.01334    0.01786   0.747   0.4635  
## hp          -0.02148    0.02177  -0.987   0.3350  
## drat         0.78711    1.63537   0.481   0.6353  
## wt          -3.71530    1.89441  -1.961   0.0633 .
## qsec         0.82104    0.73084   1.123   0.2739  
## vs           0.31776    2.10451   0.151   0.8814  
## am           2.52023    2.05665   1.225   0.2340  
## gear         0.65541    1.49326   0.439   0.6652  
## carb        -0.19942    0.82875  -0.241   0.8122  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared:  0.869,  Adjusted R-squared:  0.8066 
## F-statistic: 13.93 on 10 and 21 DF,  p-value: 3.793e-07
par(mfrow = c(2,2))
plot(model1)

Modeling

Build regression mdoels by trying every two possible variables in the dataset.

setdiff(names(mtcars), "mpg") %>% 
  combn(2, paste, collapse = " + ") %>%
  structure(., names = .) %>% 
  map(~ formula(paste("mpg ~ ", .x))) %>% 
  map(lm, data = mtcars) %>% 
  map_df(tidy, .id = "predictors") %>% 
  arrange(-std.error) %>% 
  head(20)
##     predictors        term   estimate std.error  statistic      p.value
## 1  qsec + carb (Intercept)  19.139362 13.847403  1.3821625 1.774758e-01
## 2    qsec + vs (Intercept)  26.182530 11.625525  2.2521589 3.205404e-02
## 3    hp + qsec (Intercept)  48.323705 11.103306  4.3521906 1.526469e-04
## 4  qsec + gear (Intercept) -30.710845  9.670228 -3.1758139 3.529643e-03
## 5   cyl + qsec (Intercept)  47.381699  8.806606  5.3802454 8.819050e-06
## 6  drat + qsec (Intercept) -27.839918  8.299227 -3.3545196 2.229131e-03
## 7   cyl + drat (Intercept)  28.724665  7.592143  3.7834726 7.178250e-04
## 8    drat + wt (Intercept)  30.290370  7.317878  4.1392285 2.737824e-04
## 9  disp + qsec (Intercept)  25.504508  7.184094  3.5501356 1.335871e-03
## 10   drat + am (Intercept)  -1.949883  7.073285 -0.2756687 7.847574e-01
## 11 disp + drat (Intercept)  21.844880  6.747971  3.2372516 3.016655e-03
## 12   qsec + am (Intercept) -18.889281  6.596973 -2.8633255 7.710583e-03
## 13   am + gear (Intercept)  16.864675  6.511673  2.5899142 1.486255e-02
## 14 drat + gear (Intercept)  -7.537474  5.580198 -1.3507539 1.872206e-01
## 15   wt + qsec (Intercept)  19.746223  5.252062  3.7597088 7.650466e-04
## 16   wt + gear (Intercept)  38.915653  5.097397  7.6344173 2.037781e-08
## 17   hp + drat (Intercept)  10.789861  5.077752  2.1249289 4.223773e-02
## 18  cyl + gear (Intercept)  34.659497  4.936925  7.0204633 1.013557e-07
## 19   drat + vs (Intercept)  -1.825317  4.902758 -0.3723041 7.123734e-01
## 20 disp + gear (Intercept)  29.104942  4.485919  6.4880668 4.214144e-07

K-mean cluster.

iriscaled <- scale(iris[,-5])
fitk <- kmeans(iriscaled, 3, nstart = 100)
plot(iris, col = fitk$cluster)

fitk$size
## [1] 50 47 53
table(iris$Species, fitk$cluster)
##             
##               1  2  3
##   setosa     50  0  0
##   versicolor  0 11 39
##   virginica   0 36 14

References

http://www.win-vector.com/blog/2018/10/faceted-graphs-with-cdata-and-ggplot2/

https://sebastiansauer.github.io/dplyr_filter/

https://www.youtube.com/channel/UCCjGMIrOwBOi6ogrDdjtk9w

https://www.youtube.com/channel/UCctc3RvC6n5dJXP_NR7wfjQ

Copyright @ Stewart Li stewardli8@msn.com