Data visualization course

Summarization

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.5     ✔ purrr   0.3.4
## ✔ tibble  3.1.6     ✔ dplyr   1.0.8
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
mtcars %>% 
  mutate( kml = mpg * 0.42) %>% 
  group_by(cyl) %>% 
  summarize(avg_US = mean(mpg), avg_metric = mean(kml))
## # A tibble: 3 × 3
##     cyl avg_US avg_metric
##   <dbl>  <dbl>      <dbl>
## 1     4   26.7      11.2 
## 2     6   19.7       8.29
## 3     8   15.1       6.34
mpg %>% 
  group_by(manufacturer, year) %>% 
  summarize_at(vars(cty, hwy), mean)
## # A tibble: 30 × 4
## # Groups:   manufacturer [15]
##    manufacturer  year   cty   hwy
##    <chr>        <int> <dbl> <dbl>
##  1 audi          1999  17.1  26.1
##  2 audi          2008  18.1  26.8
##  3 chevrolet     1999  15.1  21.6
##  4 chevrolet     2008  14.9  22.1
##  5 dodge         1999  13.4  18.4
##  6 dodge         2008  13.0  17.6
##  7 ford          1999  13.9  18.6
##  8 ford          2008  14.1  20.5
##  9 honda         1999  24.8  31.6
## 10 honda         2008  24    33.8
## # … with 20 more rows
  • change layout
mpg %>% count(class, year)%>% 
  spread(class, n) 
## # A tibble: 2 × 8
##    year `2seater` compact midsize minivan pickup subcompact   suv
##   <int>     <int>   <int>   <int>   <int>  <int>      <int> <int>
## 1  1999         2      25      20       6     16         19    29
## 2  2008         3      22      21       5     17         16    33
  • change all characters into factors
mpg <- mpg %>% 
  mutate_if(is.character, as.factor) #if a column is a character, change to a factor
  • wide to long data
mpg1 <- mpg %>% 
  gather("key", "value", cty, hwy)
  • convert wide data to long data using pivot_longer
## Your code here. Naming choices for 1 and 2 are yours
dta <- mpg %>% 
  pivot_longer(cty:hwy, names_to = "var", values_to = "value") %>%  # Both of those are 
  # value label
   mutate(var = ifelse( var == 'cty', 'city','highway'))

ggplot(dta, aes(x = displ, y = value)) + 
  geom_point(aes(color = var))  +
  geom_smooth(aes(color = var), se = F)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

  • explore distribution
library(DataExplorer)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(naniar)
plot_histogram(riskfactors)

  • explore relationship/correlation
library(psych)
pairs.panels(riskfactors[,1:10])
## Warning in cor(x, y, use = "pairwise", method = method): the standard deviation
## is zero

## Warning in cor(x, y, use = "pairwise", method = method): the standard deviation
## is zero

  • create a individual theme
my_theme <- function(){
  theme_bw() + 
 
  theme(axis.title = element_text(size=16),
  axis.text = element_text(size=14),
  text = element_text(size = 14))
}

Explore missing values

# install.packages("naniar")
library(naniar)
# head(riskfactors)
riskfactors <- riskfactors
gg_miss_upset(riskfactors,nsets=10)

# install.packages("DataExplorer")

plot_missing(riskfactors)

# take a quick look at the data types of each column
visdat::vis_dat(riskfactors)
## Warning: `gather_()` was deprecated in tidyr 1.2.0.
## Please use `gather()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

Add statistical test

library(ggpubr)
plt <- ggplot(  data=mpg,
  mapping= aes(x =  as.factor(year),
               y = cty,
               color = as.factor(year) ) )+
  geom_boxplot() +
  geom_jitter(width=0.1)+
  labs(x = 'Year',
       y = "City mpg") +
  my_theme()+ facet_wrap( ~ manufacturer,nrow = 2)  
  
# add statistical test
  my_comparisons <- list(c('1999','2008'))
plt + stat_compare_means() +
stat_compare_means(comparisons = my_comparisons)

Add texts to dots

USArrests <- USArrests %>% rownames_to_column('State')

ggplot(USArrests, aes(
x=UrbanPop,y=Murder))+
geom_point() +
  labs(x = "Percent of population that is urban", 
       y = "Murder arrests (per 100,000)",
       caption =  "McNeil (1997). Interactive Data Analysis")+
  geom_text(aes(label=State),size=3)

Set the legend

ggplot(iris, aes(x=  Sepal.Length , fill= as.factor( Species)) ) +  #whole plot's option
  
  geom_histogram(aes(y=..density..),alpha=0.5, position="identity" , bins = 50)+
  geom_density(aes(linetype=as.factor(Species)),alpha=.1 )+    #aesthetic's option
  
  scale_fill_manual( name = "Groups",values = c("grey", "black", "skyblue"),labels = c("setosa", "versicolor" , "virginica" ))+  
  scale_linetype_manual( name = "Groups" ,values = c(1,3,5),labels = c("setosa", "versicolor" , "virginica") )+       # common legend 
  
  labs(x = "Sepal.Length", 
       y = "Density", 
       title = "") 

Create a panel of plots

  • combine multiple plots into one
p1=ggplot(data=riskfactors,aes(x=age))+
  geom_histogram(bins = 30 )

p2=ggplot(data=riskfactors,aes(x=sex))+
  geom_bar (aes(x=sex) )

p3=ggplot(riskfactors,aes(x = education, y = bmi))+
  geom_boxplot (  )

p4=ggplot(riskfactors, aes(x = marital )) + 
  geom_bar(aes(group = education, y = (..count..)/sum(..count..),fill = education)) + 
          scale_y_continuous(labels=scales::percent) 

# install.packages("ggpubr")
library(ggpubr)
ggarrange(p1, p2, p3, p4, ncol = 2, nrow=2)
## Warning: Removed 11 rows containing non-finite values (stat_boxplot).

Plots in regression

  • create linear regression model
data("Boston", package = "MASS")
linear_reg <- glm(medv ~ ., data=Boston , family = gaussian())
summary(linear_reg)
## 
## Call:
## glm(formula = medv ~ ., family = gaussian(), data = Boston)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -15.595   -2.730   -0.518    1.777   26.199  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.646e+01  5.103e+00   7.144 3.28e-12 ***
## crim        -1.080e-01  3.286e-02  -3.287 0.001087 ** 
## zn           4.642e-02  1.373e-02   3.382 0.000778 ***
## indus        2.056e-02  6.150e-02   0.334 0.738288    
## chas         2.687e+00  8.616e-01   3.118 0.001925 ** 
## nox         -1.777e+01  3.820e+00  -4.651 4.25e-06 ***
## rm           3.810e+00  4.179e-01   9.116  < 2e-16 ***
## age          6.922e-04  1.321e-02   0.052 0.958229    
## dis         -1.476e+00  1.995e-01  -7.398 6.01e-13 ***
## rad          3.060e-01  6.635e-02   4.613 5.07e-06 ***
## tax         -1.233e-02  3.760e-03  -3.280 0.001112 ** 
## ptratio     -9.527e-01  1.308e-01  -7.283 1.31e-12 ***
## black        9.312e-03  2.686e-03   3.467 0.000573 ***
## lstat       -5.248e-01  5.072e-02 -10.347  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 22.51785)
## 
##     Null deviance: 42716  on 505  degrees of freedom
## Residual deviance: 11079  on 492  degrees of freedom
## AIC: 3027.6
## 
## Number of Fisher Scoring iterations: 2
  • summary
knitr::kable(broom::tidy(linear_reg))
term estimate std.error statistic p.value
(Intercept) 36.4594884 5.1034588 7.1440742 0.0000000
crim -0.1080114 0.0328650 -3.2865169 0.0010868
zn 0.0464205 0.0137275 3.3815763 0.0007781
indus 0.0205586 0.0614957 0.3343100 0.7382881
chas 2.6867338 0.8615798 3.1183809 0.0019250
nox -17.7666112 3.8197437 -4.6512574 0.0000042
rm 3.8098652 0.4179253 9.1161402 0.0000000
age 0.0006922 0.0132098 0.0524024 0.9582293
dis -1.4755668 0.1994547 -7.3980036 0.0000000
rad 0.3060495 0.0663464 4.6128998 0.0000051
tax -0.0123346 0.0037605 -3.2800091 0.0011116
ptratio -0.9527472 0.1308268 -7.2825106 0.0000000
black 0.0093117 0.0026860 3.4667926 0.0005729
lstat -0.5247584 0.0507153 -10.3471458 0.0000000
  • create logistical regression
# load the Pima Indians dataset from the mlbench dataset
library(mlbench)
data(PimaIndiansDiabetes) 
# rename dataset to have shorter name because lazy
diabetes <- PimaIndiansDiabetes

logistic_reg <- glm(diabetes ~ ., data=diabetes, family = binomial)

summary(logistic_reg)
## 
## Call:
## glm(formula = diabetes ~ ., family = binomial, data = diabetes)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5566  -0.7274  -0.4159   0.7267   2.9297  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -8.4046964  0.7166359 -11.728  < 2e-16 ***
## pregnant     0.1231823  0.0320776   3.840 0.000123 ***
## glucose      0.0351637  0.0037087   9.481  < 2e-16 ***
## pressure    -0.0132955  0.0052336  -2.540 0.011072 *  
## triceps      0.0006190  0.0068994   0.090 0.928515    
## insulin     -0.0011917  0.0009012  -1.322 0.186065    
## mass         0.0897010  0.0150876   5.945 2.76e-09 ***
## pedigree     0.9451797  0.2991475   3.160 0.001580 ** 
## age          0.0148690  0.0093348   1.593 0.111192    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 993.48  on 767  degrees of freedom
## Residual deviance: 723.45  on 759  degrees of freedom
## AIC: 741.45
## 
## Number of Fisher Scoring iterations: 5
  • summary
knitr::kable(broom::tidy(logistic_reg))
term estimate std.error statistic p.value
(Intercept) -8.4046964 0.7166359 -11.7279870 0.0000000
pregnant 0.1231823 0.0320776 3.8401403 0.0001230
glucose 0.0351637 0.0037087 9.4813935 0.0000000
pressure -0.0132955 0.0052336 -2.5404160 0.0110721
triceps 0.0006190 0.0068994 0.0897131 0.9285152
insulin -0.0011917 0.0009012 -1.3223094 0.1860652
mass 0.0897010 0.0150876 5.9453340 0.0000000
pedigree 0.9451797 0.2991475 3.1595780 0.0015800
age 0.0148690 0.0093348 1.5928584 0.1111920

Create forest plots for coefficients or OR

library(sjPlot)
## Registered S3 methods overwritten by 'effectsize':
##   method              from      
##   standardize.Surv    datawizard
##   standardize.bcplm   datawizard
##   standardize.clm2    datawizard
##   standardize.default datawizard
##   standardize.mediate datawizard
##   standardize.wbgee   datawizard
##   standardize.wbm     datawizard
plot_model(linear_reg, show.values = TRUE, value.offset = 0.5)

plot_model(logistic_reg, show.values = TRUE, value.offset = .5, vline.color = "black")

another way

library(finalfit) 

explanatory = c(  "crim"  ,  "zn"   ,   "indus"  ,   "nox"   ,  "rm"   ,   "age" ,    "dis"  ,   "rad"   ,  "tax"   ,"ptratio" ,"black"  , "lstat" )

dependent = "medv"


Boston %>%
 coefficient_plot(dependent, explanatory, table_text_size=3, 
                  title_text_size=12,
                   plot_opts=list(xlab("Beta, 95% CI"), 
                                  theme(axis.title = element_text(size=12))))

library(finalfit) 

explanatory = c(  "pregnant", "glucose" , "pressure", "triceps"  ,"insulin" , "mass"   ,  "pedigree", "age"  )

dependent = "diabetes"

diabetes %>%
 or_plot(dependent, explanatory, table_text_size=3, 
                  title_text_size=12,
                   plot_opts=list(xlab("OR, 95% CI"), 
                                  theme(axis.title = element_text(size=12))))
## Waiting for profiling to be done...
## Waiting for profiling to be done...
## Waiting for profiling to be done...

  • qq plot
ggqqplot( (Boston$medv))