Data visualization course

Summarization

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✔ ggplot2 3.3.5     ✔ purrr   0.3.4
## ✔ tibble  3.1.6     ✔ dplyr   1.0.8
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

mtcars %>% 
  mutate( kml = mpg * 0.42) %>% 
  group_by(cyl) %>% 
  summarize(avg_US = mean(mpg), avg_metric = mean(kml))

## # A tibble: 3 × 3
##     cyl avg_US avg_metric
##   <dbl>  <dbl>      <dbl>
## 1     4   26.7      11.2 
## 2     6   19.7       8.29
## 3     8   15.1       6.34

mpg %>% 
  group_by(manufacturer, year) %>% 
  summarize_at(vars(cty, hwy), mean)

## # A tibble: 30 × 4
## # Groups:   manufacturer [15]
##    manufacturer  year   cty   hwy
##    <chr>        <int> <dbl> <dbl>
##  1 audi          1999  17.1  26.1
##  2 audi          2008  18.1  26.8
##  3 chevrolet     1999  15.1  21.6
##  4 chevrolet     2008  14.9  22.1
##  5 dodge         1999  13.4  18.4
##  6 dodge         2008  13.0  17.6
##  7 ford          1999  13.9  18.6
##  8 ford          2008  14.1  20.5
##  9 honda         1999  24.8  31.6
## 10 honda         2008  24    33.8
## # … with 20 more rows

change layout

mpg %>% count(class, year)%>% 
  spread(class, n)

## # A tibble: 2 × 8
##    year `2seater` compact midsize minivan pickup subcompact   suv
##   <int>     <int>   <int>   <int>   <int>  <int>      <int> <int>
## 1  1999         2      25      20       6     16         19    29
## 2  2008         3      22      21       5     17         16    33

change all characters into factors

mpg <- mpg %>% 
  mutate_if(is.character, as.factor) #if a column is a character, change to a factor

wide to long data

mpg1 <- mpg %>% 
  gather("key", "value", cty, hwy)

convert wide data to long data using pivot_longer

## Your code here. Naming choices for 1 and 2 are yours
dta <- mpg %>% 
  pivot_longer(cty:hwy, names_to = "var", values_to = "value") %>%  # Both of those are 
  # value label
   mutate(var = ifelse( var == 'cty', 'city','highway'))

ggplot(dta, aes(x = displ, y = value)) + 
  geom_point(aes(color = var))  +
  geom_smooth(aes(color = var), se = F)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

explore distribution

library(DataExplorer)
library(psych)

## 
## Attaching package: 'psych'

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

library(naniar)
plot_histogram(riskfactors)

explore relationship/correlation

library(psych)
pairs.panels(riskfactors[,1:10])

## Warning in cor(x, y, use = "pairwise", method = method): the standard deviation
## is zero

## Warning in cor(x, y, use = "pairwise", method = method): the standard deviation
## is zero

create a individual theme

my_theme <- function(){
  theme_bw() + 
 
  theme(axis.title = element_text(size=16),
  axis.text = element_text(size=14),
  text = element_text(size = 14))
}

Explore missing values

# install.packages("naniar")
library(naniar)
# head(riskfactors)
riskfactors <- riskfactors

gg_miss_upset(riskfactors,nsets=10)

# install.packages("DataExplorer")

plot_missing(riskfactors)

# take a quick look at the data types of each column
visdat::vis_dat(riskfactors)

## Warning: `gather_()` was deprecated in tidyr 1.2.0.
## Please use `gather()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

Add statistical test

library(ggpubr)
plt <- ggplot(  data=mpg,
  mapping= aes(x =  as.factor(year),
               y = cty,
               color = as.factor(year) ) )+
  geom_boxplot() +
  geom_jitter(width=0.1)+
  labs(x = 'Year',
       y = "City mpg") +
  my_theme()+ facet_wrap( ~ manufacturer,nrow = 2)  
  
# add statistical test
  my_comparisons <- list(c('1999','2008'))
plt + stat_compare_means() +
stat_compare_means(comparisons = my_comparisons)

Add texts to dots

USArrests <- USArrests %>% rownames_to_column('State')

ggplot(USArrests, aes(
x=UrbanPop,y=Murder))+
geom_point() +
  labs(x = "Percent of population that is urban", 
       y = "Murder arrests (per 100,000)",
       caption =  "McNeil (1997). Interactive Data Analysis")+
  geom_text(aes(label=State),size=3)

Set the legend

ggplot(iris, aes(x=  Sepal.Length , fill= as.factor( Species)) ) +  #whole plot's option
  
  geom_histogram(aes(y=..density..),alpha=0.5, position="identity" , bins = 50)+
  geom_density(aes(linetype=as.factor(Species)),alpha=.1 )+    #aesthetic's option
  
  scale_fill_manual( name = "Groups",values = c("grey", "black", "skyblue"),labels = c("setosa", "versicolor" , "virginica" ))+  
  scale_linetype_manual( name = "Groups" ,values = c(1,3,5),labels = c("setosa", "versicolor" , "virginica") )+       # common legend 
  
  labs(x = "Sepal.Length", 
       y = "Density", 
       title = "")

Create a panel of plots

combine multiple plots into one

p1=ggplot(data=riskfactors,aes(x=age))+
  geom_histogram(bins = 30 )

p2=ggplot(data=riskfactors,aes(x=sex))+
  geom_bar (aes(x=sex) )

p3=ggplot(riskfactors,aes(x = education, y = bmi))+
  geom_boxplot (  )

p4=ggplot(riskfactors, aes(x = marital )) + 
  geom_bar(aes(group = education, y = (..count..)/sum(..count..),fill = education)) + 
          scale_y_continuous(labels=scales::percent) 

# install.packages("ggpubr")
library(ggpubr)
ggarrange(p1, p2, p3, p4, ncol = 2, nrow=2)

## Warning: Removed 11 rows containing non-finite values (stat_boxplot).

Plots in regression

create linear regression model

data("Boston", package = "MASS")
linear_reg <- glm(medv ~ ., data=Boston , family = gaussian())
summary(linear_reg)

## 
## Call:
## glm(formula = medv ~ ., family = gaussian(), data = Boston)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -15.595   -2.730   -0.518    1.777   26.199  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.646e+01  5.103e+00   7.144 3.28e-12 ***
## crim        -1.080e-01  3.286e-02  -3.287 0.001087 ** 
## zn           4.642e-02  1.373e-02   3.382 0.000778 ***
## indus        2.056e-02  6.150e-02   0.334 0.738288    
## chas         2.687e+00  8.616e-01   3.118 0.001925 ** 
## nox         -1.777e+01  3.820e+00  -4.651 4.25e-06 ***
## rm           3.810e+00  4.179e-01   9.116  < 2e-16 ***
## age          6.922e-04  1.321e-02   0.052 0.958229    
## dis         -1.476e+00  1.995e-01  -7.398 6.01e-13 ***
## rad          3.060e-01  6.635e-02   4.613 5.07e-06 ***
## tax         -1.233e-02  3.760e-03  -3.280 0.001112 ** 
## ptratio     -9.527e-01  1.308e-01  -7.283 1.31e-12 ***
## black        9.312e-03  2.686e-03   3.467 0.000573 ***
## lstat       -5.248e-01  5.072e-02 -10.347  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 22.51785)
## 
##     Null deviance: 42716  on 505  degrees of freedom
## Residual deviance: 11079  on 492  degrees of freedom
## AIC: 3027.6
## 
## Number of Fisher Scoring iterations: 2

summary

knitr::kable(broom::tidy(linear_reg))

term	estimate	std.error	statistic	p.value
(Intercept)	36.4594884	5.1034588	7.1440742	0.0000000
crim	-0.1080114	0.0328650	-3.2865169	0.0010868
zn	0.0464205	0.0137275	3.3815763	0.0007781
indus	0.0205586	0.0614957	0.3343100	0.7382881
chas	2.6867338	0.8615798	3.1183809	0.0019250
nox	-17.7666112	3.8197437	-4.6512574	0.0000042
rm	3.8098652	0.4179253	9.1161402	0.0000000
age	0.0006922	0.0132098	0.0524024	0.9582293
dis	-1.4755668	0.1994547	-7.3980036	0.0000000
rad	0.3060495	0.0663464	4.6128998	0.0000051
tax	-0.0123346	0.0037605	-3.2800091	0.0011116
ptratio	-0.9527472	0.1308268	-7.2825106	0.0000000
black	0.0093117	0.0026860	3.4667926	0.0005729
lstat	-0.5247584	0.0507153	-10.3471458	0.0000000

create logistical regression

# load the Pima Indians dataset from the mlbench dataset
library(mlbench)
data(PimaIndiansDiabetes) 
# rename dataset to have shorter name because lazy
diabetes <- PimaIndiansDiabetes

logistic_reg <- glm(diabetes ~ ., data=diabetes, family = binomial)

summary(logistic_reg)

## 
## Call:
## glm(formula = diabetes ~ ., family = binomial, data = diabetes)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5566  -0.7274  -0.4159   0.7267   2.9297  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -8.4046964  0.7166359 -11.728  < 2e-16 ***
## pregnant     0.1231823  0.0320776   3.840 0.000123 ***
## glucose      0.0351637  0.0037087   9.481  < 2e-16 ***
## pressure    -0.0132955  0.0052336  -2.540 0.011072 *  
## triceps      0.0006190  0.0068994   0.090 0.928515    
## insulin     -0.0011917  0.0009012  -1.322 0.186065    
## mass         0.0897010  0.0150876   5.945 2.76e-09 ***
## pedigree     0.9451797  0.2991475   3.160 0.001580 ** 
## age          0.0148690  0.0093348   1.593 0.111192    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 993.48  on 767  degrees of freedom
## Residual deviance: 723.45  on 759  degrees of freedom
## AIC: 741.45
## 
## Number of Fisher Scoring iterations: 5

summary

knitr::kable(broom::tidy(logistic_reg))

term	estimate	std.error	statistic	p.value
(Intercept)	-8.4046964	0.7166359	-11.7279870	0.0000000
pregnant	0.1231823	0.0320776	3.8401403	0.0001230
glucose	0.0351637	0.0037087	9.4813935	0.0000000
pressure	-0.0132955	0.0052336	-2.5404160	0.0110721
triceps	0.0006190	0.0068994	0.0897131	0.9285152
insulin	-0.0011917	0.0009012	-1.3223094	0.1860652
mass	0.0897010	0.0150876	5.9453340	0.0000000
pedigree	0.9451797	0.2991475	3.1595780	0.0015800
age	0.0148690	0.0093348	1.5928584	0.1111920

Create forest plots for `coefficients` or `OR`

library(sjPlot)

## Registered S3 methods overwritten by 'effectsize':
##   method              from      
##   standardize.Surv    datawizard
##   standardize.bcplm   datawizard
##   standardize.clm2    datawizard
##   standardize.default datawizard
##   standardize.mediate datawizard
##   standardize.wbgee   datawizard
##   standardize.wbm     datawizard

plot_model(linear_reg, show.values = TRUE, value.offset = 0.5)

plot_model(logistic_reg, show.values = TRUE, value.offset = .5, vline.color = "black")

another way

library(finalfit) 

explanatory = c(  "crim"  ,  "zn"   ,   "indus"  ,   "nox"   ,  "rm"   ,   "age" ,    "dis"  ,   "rad"   ,  "tax"   ,"ptratio" ,"black"  , "lstat" )

dependent = "medv"


Boston %>%
 coefficient_plot(dependent, explanatory, table_text_size=3, 
                  title_text_size=12,
                   plot_opts=list(xlab("Beta, 95% CI"), 
                                  theme(axis.title = element_text(size=12))))

library(finalfit) 

explanatory = c(  "pregnant", "glucose" , "pressure", "triceps"  ,"insulin" , "mass"   ,  "pedigree", "age"  )

dependent = "diabetes"

diabetes %>%
 or_plot(dependent, explanatory, table_text_size=3, 
                  title_text_size=12,
                   plot_opts=list(xlab("OR, 95% CI"), 
                                  theme(axis.title = element_text(size=12))))

## Waiting for profiling to be done...
## Waiting for profiling to be done...
## Waiting for profiling to be done...

qq plot

ggqqplot( (Boston$medv))

Data visualization course

Daniel

2022-05-20

Data visualization course

Summarization

Explore missing values

Add statistical test

Add texts to dots

Set the legend

Create a panel of plots

Plots in regression

Create forest plots for `coefficients` or `OR`

Data visualization course

Daniel

2022-05-20

Data visualization course

Summarization

Explore missing values

Add statistical test

Add texts to dots

Set the legend

Create a panel of plots

Plots in regression

Create forest plots for coefficients or OR

Create forest plots for `coefficients` or `OR`