library(e1071)
library(ggpubr)
library(pixmap)
library(dplyr)
library(ggplot2)
library(gapminder)

1

Scatter Plot

# Scatter plot: weight against fuel efficiency for the first 20 rows of
# mtcars, with points and labels coloured by transmission type.
# The car names are stored as row names, so copy them into an ordinary
# column that aes() can map to the text labels.
cars <- mutate(mtcars, name = rownames(mtcars))
c1 <- ggplot(
  cars[1:20, ],
  aes(x = wt, y = mpg, label = name)
)
c1 +
  geom_point(aes(colour = factor(am)), size = 2) +
  # Nudge the labels up and to the right so they do not cover the points.
  geom_text(
    aes(label = name, colour = factor(am)),
    nudge_x = 0.2,
    nudge_y = 0.5
  ) +
  labs(
    title = "Weight versus Fuel Efficiency",
    x = "Weight (1000 lbs)",
    y = "Miles per US gallon",
    caption = "Source: 1974 Motor Trend US Magazine"
  ) +
  # Legend labels are matched to the levels of factor(am) in order
  # (mtcars documents am as 0 = automatic, 1 = manual).
  scale_colour_discrete(
    name = "Transmission",
    labels = c("Automatic", "Manual")
  )

Time series

# Time series: per-capita GDP of Afghanistan over the years covered by the
# gapminder data set.
# The country is selected by name instead of by row position (rows 1:12), so
# the plot stays correct if the data are reordered or extended.
GDPaf <- ggplot(
  dplyr::filter(gapminder, country == "Afghanistan"),
  aes(x = year, y = gdpPercap)
)
GDPaf +
  geom_line() +
  labs(
    title = "Per Capita GDP in Afghanistan",
    x = "Year",
    y = "Per Capita GDP",
    caption = "Source: Gapminder Data Set"
  ) +
  theme_bw() +
  # Drop the minor grid lines to reduce clutter behind the single line.
  theme(panel.grid.minor = element_blank())

Slope Chart

# Slope chart: per-capita GDP of four countries at three points in time
# (1952, 1982, 2002), one coloured line per country.
GDP <- filter(
  gapminder,
  country %in% c("Canada", "Cuba", "Mexico", "United States") &
    year %in% c(1952, 1982, 2002)
)

# The x/y/group/colour mappings are shared by every layer, so they are
# declared once here and inherited below. year is converted to a factor so
# the three years are evenly spaced categories rather than a numeric axis.
ggplot(
  GDP,
  aes(
    x = as.factor(year),
    y = gdpPercap,
    group = country,
    colour = country
  )
) +
  geom_line(size = 2, alpha = 0.8) +
  geom_point(size = 5, alpha = 0.8) +
  # Value labels, one layer per year. hjust is a fixed setting, not a data
  # mapping, so it is passed outside aes(): the 1952 labels sit to the left
  # of their points, the later years to the right.
  geom_text(
    data = subset(GDP, year == 1952),
    aes(label = scales::dollar(round(gdpPercap, 0))),
    hjust = 1.5
  ) +
  geom_text(
    data = subset(GDP, year == 1982),
    aes(label = scales::dollar(round(gdpPercap, 0))),
    hjust = -0.5
  ) +
  geom_text(
    data = subset(GDP, year == 2002),
    aes(label = scales::dollar(round(gdpPercap, 0))),
    hjust = -0.3
  ) +
  scale_colour_brewer(palette = "Set2") +
  labs(
    title = "A Comparison of GDP",
    subtitle = "GDP per capita change, 1952 − 2002",
    caption = "Source: Gapminder Dataset",
    x = NULL, y = NULL, colour = NULL
  ) +
  theme_minimal() +
  # The values are printed next to the points, so the y-axis text is removed.
  theme(axis.text.y = element_blank())

Dot and Boxplot

# Libraries
library(tidyverse)
library(hrbrthemes)
library(viridis)
# create a dataset
# Five groups of simulated test scores with different sizes, means and
# standard deviations. The seed makes the random draws reproducible.
set.seed(2020)
data <- data.frame(
  name = c(
    rep("Group A", 500), rep("Group B", 400), rep("Group C", 200),
    rep("Group D", 75), rep("Group E", 100)
  ),
  value = c(
    rnorm(500, 75, 5), rnorm(400, 78, 2), rnorm(200, 65, 12),
    rnorm(75, 70, 8), rnorm(100, 80, 5)
  )
)

# Box plot per group with every individual observation overlaid as a dot.
g <- ggplot(data, aes(x = name, y = value))
g +
  # The jitter layer below already draws every observation, so the boxplot's
  # own outlier markers are suppressed to avoid plotting those points twice.
  geom_boxplot(aes(fill = name), outlier.shape = NA) +
  # Jitter horizontally only (height = 0) so the plotted scores are not
  # shifted away from their true values.
  geom_jitter(
    shape = 16,
    size = 0.5,
    position = position_jitter(width = 0.15, height = 0)
  ) +
  labs(
    title = "Test Scores Among Groups",
    x = "Group",
    y = "Test Score"
  ) +
  # One fill colour per group, in the order of the group names.
  scale_fill_manual(
    values = c("#d9cddb", "#d9dbe8", "#d3e7e6", "#def3e0", "#fefcd6")
  ) +
  theme_bw()

2

According to Swalin (2018), there are two common ways to handle missing data: deletion and imputation.

Under deletion, we have 3 subsets:

Imputation is the preferred approach when dealing with missing data. Under imputation, there are two broad categories:

Time series: there are three scenarios:

General: divided into two groups:

3

# Count the missing cells across all columns of the built-in airquality
# data set (the total, not a per-column breakdown).
sum(is.na(airquality))
## [1] 44
# Show each column's type and first few values; NA entries are visible in
# the Ozone and Solar.R previews below.
str(airquality)
## 'data.frame':    153 obs. of  6 variables:
##  $ Ozone  : int  41 36 12 18 NA 28 23 19 8 NA ...
##  $ Solar.R: int  190 118 149 313 NA NA 299 99 19 194 ...
##  $ Wind   : num  7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
##  $ Temp   : int  67 72 74 62 56 66 65 59 61 69 ...
##  $ Month  : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ Day    : int  1 2 3 4 5 6 7 8 9 10 ...

The Ozone and Solar.R columns contain NA values.

# Baseline view of the untouched data before any missing-value handling:
# 153 rows, with Ozone and Solar.R stored as integers and containing NA.
glimpse(airquality)
## Observations: 153
## Variables: 6
## $ Ozone   <int> 41, 36, 12, 18, NA, 28, 23, 19, 8, NA, 7, 16, 11, 14, 18, 1...
## $ Solar.R <int> 190, 118, 149, 313, NA, NA, 299, 99, 19, 194, NA, 256, 290,...
## $ Wind    <dbl> 7.4, 8.0, 12.6, 11.5, 14.3, 14.9, 8.6, 13.8, 20.1, 8.6, 6.9...
## $ Temp    <int> 67, 72, 74, 62, 56, 66, 65, 59, 61, 69, 74, 69, 66, 68, 58,...
## $ Month   <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,...
## $ Day     <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, ...
# Mean imputation on a copy of airquality: every NA in Ozone and Solar.R is
# replaced by the mean of that column's observed values. The original data
# frame is left unchanged. Because the means are not whole numbers, the two
# columns become doubles.
Air1 <- transform(
  airquality,
  Ozone = replace(Ozone, is.na(Ozone), mean(Ozone, na.rm = TRUE)),
  Solar.R = replace(Solar.R, is.na(Solar.R), mean(Solar.R, na.rm = TRUE))
)
# Inspect the imputed copy: Ozone and Solar.R are now <dbl>, and the former
# NA positions hold the column means (42.12931 and 185.9315 in the output).
glimpse(Air1)
## Observations: 153
## Variables: 6
## $ Ozone   <dbl> 41.00000, 36.00000, 12.00000, 18.00000, 42.12931, 28.00000,...
## $ Solar.R <dbl> 190.0000, 118.0000, 149.0000, 313.0000, 185.9315, 185.9315,...
## $ Wind    <dbl> 7.4, 8.0, 12.6, 11.5, 14.3, 14.9, 8.6, 13.8, 20.1, 8.6, 6.9...
## $ Temp    <int> 67, 72, 74, 62, 56, 66, 65, 59, 61, 69, 74, 69, 66, 68, 58,...
## $ Month   <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,...
## $ Day     <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, ...
# Re-display the original data for comparison: it still contains its NA
# values, because the imputation was applied to the copy Air1 only.
glimpse(airquality)
## Observations: 153
## Variables: 6
## $ Ozone   <int> 41, 36, 12, 18, NA, 28, 23, 19, 8, NA, 7, 16, 11, 14, 18, 1...
## $ Solar.R <int> 190, 118, 149, 313, NA, NA, 299, 99, 19, 194, NA, 256, 290,...
## $ Wind    <dbl> 7.4, 8.0, 12.6, 11.5, 14.3, 14.9, 8.6, 13.8, 20.1, 8.6, 6.9...
## $ Temp    <int> 67, 72, 74, 62, 56, 66, 65, 59, 61, 69, 74, 69, 66, 68, 58,...
## $ Month   <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,...
## $ Day     <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, ...
# Listwise deletion: na.omit() drops every row that has an NA in any column.
# The recorded output shows 111 of the original 153 rows remaining.
Air2 <- na.omit(airquality)
glimpse(Air2)
## Observations: 111
## Variables: 6
## $ Ozone   <int> 41, 36, 12, 18, 23, 19, 8, 16, 11, 14, 18, 14, 34, 6, 30, 1...
## $ Solar.R <int> 190, 118, 149, 313, 299, 99, 19, 256, 290, 274, 65, 334, 30...
## $ Wind    <dbl> 7.4, 8.0, 12.6, 11.5, 8.6, 13.8, 20.1, 9.7, 9.2, 10.9, 13.2...
## $ Temp    <int> 67, 72, 74, 62, 65, 59, 61, 69, 66, 68, 58, 64, 66, 57, 68,...
## $ Month   <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,...
## $ Day     <int> 1, 2, 3, 4, 7, 8, 9, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21...

Note: before deciding which method to use, we first need to understand the nature of the missing values, and then choose the method that works best for the dataset.

References

Kang, H. (2013, May 24). The prevention and handling of the missing data. Korean Journal of Anesthesiology, 64(5), 402-406. doi:https://doi.org/10.4097/kjae.2013.64.5.402

Pairwise vs. Listwise deletion: What are they and when should I use them? (n.d.). Retrieved from IBM Support: https://www.ibm.com/support/pages/pairwise-vs-listwise-deletion-what-are-they-and-when-should-i-use-them

Swalin, A. (2018, January 30). How to Handle Missing Data. Retrieved from towards data science: https://towardsdatascience.com/how-to-handle-missing-data-8646b18db0d4