tidyverse
(Wickham 2017).library(car)
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages -------------------------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
## recode(): dplyr, car
## some(): purrr, car
da <- mtcars
da %>% glimpse()
## Observations: 32
## Variables: 11
## $ mpg <dbl> 21.0, 21.0, 22.8, 21.4, 18.7, 18.1, 14.3, 24.4, 22.8, 19.2, 17.8, 16.4, ...
## $ cyl <dbl> 6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, ...
## $ disp <dbl> 160.0, 160.0, 108.0, 258.0, 360.0, 225.0, 360.0, 146.7, 140.8, 167.6, 16...
## $ hp <dbl> 110, 110, 93, 110, 175, 105, 245, 62, 95, 123, 123, 180, 180, 180, 205, ...
## $ drat <dbl> 3.90, 3.90, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92, 3.07, ...
## $ wt <dbl> 2.620, 2.875, 2.320, 3.215, 3.440, 3.460, 3.570, 3.190, 3.150, 3.440, 3....
## $ qsec <dbl> 16.46, 17.02, 18.61, 19.44, 17.02, 20.22, 15.84, 20.00, 22.90, 18.30, 18...
## $ vs <dbl> 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, ...
## $ am <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, ...
## $ gear <dbl> 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3, ...
## $ carb <dbl> 4, 4, 1, 1, 2, 1, 4, 2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1, 1, 2, 2, 4, ...
#summary(da)
da %>%
ggplot(aes(hp,mpg)) +
geom_point() +
geom_smooth(method = "lm",se = F)
la <- lm(mpg~hp,data = da)
broom::augment(la) %>%
ggplot(aes(.fitted,.resid)) +
geom_point() +
geom_hline(yintercept = 0,col="red")
broom::tidy(la)
## term estimate std.error statistic p.value
## 1 (Intercept) 30.09886054 1.6339210 18.421246 6.642736e-18
## 2 hp -0.06822828 0.0101193 -6.742389 1.787835e-07
broom::glance(la) %>% dplyr::select(1,2,AIC) #model quality
## r.squared adj.r.squared AIC
## 1 0.6024373 0.5891853 181.2386
csv
files with readr
package (Wickham, Hester, and Francois 2017).ad <- readr::read_csv("http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv") %>%
dplyr::select(-X1) %>%
dplyr::rename_all(funs(stringr::str_to_lower(.)))
## Parsed with column specification:
## cols(
## X1 = col_integer(),
## TV = col_double(),
## radio = col_double(),
## newspaper = col_double(),
## sales = col_double()
## )
lm(sales ~ newspaper,data = ad) %>% broom::glance() %>% dplyr::select(1,2,AIC) #model quality
## r.squared adj.r.squared AIC
## 1 0.05212045 0.04733317 1222.671
lm(sales ~ radio,data = ad) %>% broom::glance() %>% dplyr::select(1,2,AIC) #model quality
## r.squared adj.r.squared AIC
## 1 0.3320325 0.3286589 1152.674
lm(sales ~ tv,data = ad) %>% broom::glance() %>% dplyr::select(1,2,AIC) #model quality
## r.squared adj.r.squared AIC
## 1 0.6118751 0.6099148 1044.091
ad %>%
ggplot(aes(tv,sales)) +
geom_point() +
geom_smooth(method = "lm",se = F)
lm(sales ~ newspaper + radio + tv,data = ad) %>% broom::glance() %>% dplyr::select(1,2,AIC) #model quality
## r.squared adj.r.squared AIC
## 1 0.8972106 0.8956373 782.3622
# significant correlation and non-significant effect of newspaper to explain the outcome
cor(ad)
## tv radio newspaper sales
## tv 1.00000000 0.05480866 0.05664787 0.7822244
## radio 0.05480866 1.00000000 0.35410375 0.5762226
## newspaper 0.05664787 0.35410375 1.00000000 0.2282990
## sales 0.78222442 0.57622257 0.22829903 1.0000000
#PerformanceAnalytics::chart.Correlation(ad)
# proved that newspaper have low effect and improves the AIC
lm(sales ~ radio + tv,data = ad) %>% broom::glance() %>% dplyr::select(1,2,AIC) #model quality
## r.squared adj.r.squared AIC
## 1 0.8971943 0.8961505 780.3941
broom::augment(lm(sales ~ radio + tv,data = ad)) %>%
ggplot(aes(.fitted,.resid)) +
geom_point() +
geom_hline(yintercept = 0,col="red")
Prestige
dataset from the car
package (Fox and Weisberg 2017).#?car::Prestige
pr <- car::Prestige %>% rownames_to_column()
head(pr) #%>% dplyr::count(type)
## rowname education income women prestige census type
## 1 gov.administrators 13.11 12351 11.16 68.8 1113 prof
## 2 general.managers 12.26 25879 4.02 69.1 1130 prof
## 3 accountants 12.77 9271 15.70 63.4 1171 prof
## 4 purchasing.officers 11.42 8865 9.11 56.8 1175 prof
## 5 chemists 14.62 8403 11.68 73.5 2111 prof
## 6 physicists 15.64 11030 5.13 77.6 2113 prof
?car::Prestige
, a problem to address in this dataset would be to test the validity of the mid-1960 prestige score for occupation with data recorded in 1970.pr1 <- pr %>% dplyr::select(-census,-rowname)
pr1 %>%
gather(var,val,-prestige,-type) %>%
ggplot(aes(val)) +
geom_histogram() +
facet_grid(~var,scales = "free")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
pr1 %>% #dplyr::select(type) %>%
ggplot(aes(type)) +
geom_bar()
pr1 %>%
gather(var,val,-type) %>%
ggplot(aes(type,val)) +
geom_boxplot(aes(fill=type),position=position_dodge(0.8)) +
facet_wrap(~var,ncol = 4,scales = "free")
#cor(pr1 %>% dplyr::select(-type))
PerformanceAnalytics::chart.Correlation(pr1 %>% dplyr::select(-type),method = "pearson",histogram = F)
p1 <- pr1 %>%
ggplot(aes(education,prestige)) +
geom_point() +
geom_smooth(method = "lm",se = F)
la <- lm(prestige~education,data = pr1)
p2 <- broom::augment(la) %>%
ggplot(aes(.fitted,.resid)) +
geom_point() +
geom_hline(yintercept = 0,col="red")
Rmisc::multiplot(p1,p2,cols = 2)
broom::tidy(la)
## term estimate std.error statistic p.value
## 1 (Intercept) -10.731982 3.6770883 -2.918609 4.343443e-03
## 2 education 5.360878 0.3319882 16.147796 1.286264e-29
broom::glance(la) %>% dplyr::select(1,2,AIC) #model quality
## r.squared adj.r.squared AIC
## 1 0.7228007 0.7200287 744.0053
p1 <- pr1 %>%
ggplot(aes(income,prestige)) +
geom_point() +
geom_smooth(method = "lm",se = F)
la <- lm(prestige~income,data = pr1)
p2 <- broom::augment(la) %>%
ggplot(aes(.fitted,.resid)) +
geom_point() +
geom_hline(yintercept = 0,col="red")
Rmisc::multiplot(p1,p2,cols = 2)
broom::tidy(la)
## term estimate std.error statistic p.value
## 1 (Intercept) 27.141176368 2.2677036186 11.96857 5.135229e-21
## 2 income 0.002896799 0.0002833245 10.22432 3.192004e-17
broom::glance(la) %>% dplyr::select(1,2,AIC) #model quality
## r.squared adj.r.squared AIC
## 1 0.5110901 0.506201 801.8844
p1 <- pr1 %>%
ggplot(aes(women,prestige)) +
geom_point() +
geom_smooth(method = "lm",se = F)
la <- lm(prestige~women,data = pr1)
p2 <- broom::augment(la) %>%
ggplot(aes(.fitted,.resid)) +
geom_point() +
geom_hline(yintercept = 0,col="red")
Rmisc::multiplot(p1,p2,cols = 2)
broom::tidy(la)
## term estimate std.error statistic p.value
## 1 (Intercept) 48.69299929 2.30760232 21.101122 1.299634e-38
## 2 women -0.06417284 0.05384914 -1.191715 2.361936e-01
broom::glance(la) %>% dplyr::select(1,2,AIC) #model quality
## r.squared adj.r.squared AIC
## 1 0.01400298 0.00414301 873.4348
pr1 %>% dplyr::count(type)
## # A tibble: 4 x 2
## type n
## <fctr> <int>
## 1 bc 44
## 2 prof 31
## 3 wc 23
## 4 <NA> 4
# create dummy variable
pr1_x <- pr1 %>%
mutate(type_d1=if_else(type=="bc",1,0),
type_d2=if_else(type=="prof",1,0))
pr1_x %>% dplyr::count(type,type_d1,type_d2)
## # A tibble: 4 x 4
## type type_d1 type_d2 n
## <fctr> <dbl> <dbl> <int>
## 1 bc 1 0 44
## 2 prof 0 1 31
## 3 wc 0 0 23
## 4 <NA> NA NA 4
pr2 <- lm(prestige ~ type_d1 + type_d2 + women + income + education,data = pr1_x)
pr2 %>% broom::tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) -3.730975213 7.2113023627 -0.5173788 6.061335e-01
## 2 type_d1 2.917072023 2.6653960921 1.0944235 2.766264e-01
## 3 type_d2 8.822269036 2.7937638551 3.1578435 2.150068e-03
## 4 women 0.006443365 0.0303780676 0.2121058 8.324937e-01
## 5 income 0.001042799 0.0002622864 3.9758032 1.394941e-04
## 6 education 3.662355701 0.6458300089 5.6707735 1.626716e-07
pr2 %>% broom::glance() %>% dplyr::select(1,2,AIC)
## r.squared adj.r.squared AIC
## 1 0.8349381 0.8259674 670.9672
pr2 <- lm(prestige ~ women + income + education,data = pr1)
pr2 %>% broom::tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) -6.794334203 3.2390886463 -2.0976067 3.851306e-02
## 2 women -0.008905157 0.0304070576 -0.2928648 7.702447e-01
## 3 income 0.001313560 0.0002777812 4.7287593 7.579372e-06
## 4 education 4.186637275 0.3887012613 10.7708353 2.590069e-18
pr2 %>% broom::glance() %>% dplyr::select(1,2,AIC)
## r.squared adj.r.squared AIC
## 1 0.7981775 0.7919992 715.6358
pr2 <- lm(prestige ~ income + education,data = pr1)
pr2 %>% broom::tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) -6.847778720 3.2189770963 -2.127315 3.587840e-02
## 2 income 0.001361166 0.0002242121 6.070886 2.355196e-08
## 3 education 4.137444384 0.3489120233 11.858131 1.032202e-20
pr2 %>% broom::glance() %>% dplyr::select(1,2,AIC)
## r.squared adj.r.squared AIC
## 1 0.7980008 0.7939201 713.7251
prestige
are income
and education
.ggrepel
package (Slowikowski 2017).#pr = Prestige[ , c(4,1,2,3,6)]
ggplot(pr, aes(education, women, color = type)) +
geom_point() +
ggrepel::geom_text_repel(aes(label = ifelse(women > 50, rowname, ''))) +
labs(title="Occupations with more than 50% of women in 1970")
ggplot(pr, aes(education, prestige, color = type)) + geom_point() +
ggrepel::geom_text_repel(aes(label = ifelse(women > 50, rowname, ''))) +
labs(title="Occupations with more than 50% of women in 1970")
ggplot(pr, aes(education, prestige, color = type)) + geom_point() +
ggrepel::geom_text_repel(aes(label = ifelse(prestige > 75 | prestige < 25, rowname, ''))) +
labs(title="Occupations with a prestige lower than 25% and higher than 75% in 1970")
bib
file with write_bib()
from the knitr
package (Xie 2017).knitr::write_bib(c("car", "tidyverse", "readr", "ggplot2", "broom","knitr","devtools","ggrepel"),
file = "lmml.bib")
bash
.less lmml.bib
## @Manual{R-broom,
## title = {broom: Convert Statistical Analysis Objects into Tidy Data Frames},
## author = {David Robinson},
## year = {2017},
## note = {R package version 0.4.2},
## url = {https://CRAN.R-project.org/package=broom},
## }
## @Manual{R-car,
## title = {car: Companion to Applied Regression},
## author = {John Fox and Sanford Weisberg},
## year = {2017},
## note = {R package version 2.1-5},
## url = {https://CRAN.R-project.org/package=car},
## }
## @Manual{R-devtools,
## title = {devtools: Tools to Make Developing R Packages Easier},
## author = {Hadley Wickham and Winston Chang},
## year = {2017},
## note = {R package version 1.13.2},
## url = {https://CRAN.R-project.org/package=devtools},
## }
## @Manual{R-ggplot2,
## title = {ggplot2: Create Elegant Data Visualisations Using the Grammar of Graphics},
## author = {Hadley Wickham and Winston Chang},
## note = {http://ggplot2.tidyverse.org, https://github.com/tidyverse/ggplot2},
## year = {2017},
## }
## @Manual{R-ggrepel,
## title = {ggrepel: Repulsive Text and Label Geoms for 'ggplot2'},
## author = {Kamil Slowikowski},
## note = {R package version 0.6.9},
## url = {http://github.com/slowkow/ggrepel},
## year = {2017},
## }
## @Manual{R-knitr,
## title = {knitr: A General-Purpose Package for Dynamic Report Generation in R},
## author = {Yihui Xie},
## year = {2017},
## note = {R package version 1.16},
## url = {https://CRAN.R-project.org/package=knitr},
## }
## @Manual{R-readr,
## title = {readr: Read Rectangular Text Data},
## author = {Hadley Wickham and Jim Hester and Romain Francois},
## year = {2017},
## note = {R package version 1.1.1},
## url = {https://CRAN.R-project.org/package=readr},
## }
## @Manual{R-tidyverse,
## title = {tidyverse: Easily Install and Load 'Tidyverse' Packages},
## author = {Hadley Wickham},
## year = {2017},
## note = {R package version 1.1.1},
## url = {https://CRAN.R-project.org/package=tidyverse},
## }
devtools::session_info()
## Session info ----------------------------------------------------------------------------
## setting value
## version R version 3.4.2 (2017-09-28)
## system x86_64, linux-gnu
## ui X11
## language en_US
## collate en_US.UTF-8
## tz America/Lima
## date 2017-12-12
## Packages --------------------------------------------------------------------------------
## package * version date source
## assertthat 0.2.0 2017-04-11 CRAN (R 3.4.0)
## backports 1.1.0 2017-05-22 CRAN (R 3.4.1)
## base * 3.4.2 2017-09-29 local
## bindr 0.1 2016-11-13 cran (@0.1)
## bindrcpp * 0.2 2017-06-17 CRAN (R 3.4.1)
## broom 0.4.2 2017-02-13 CRAN (R 3.4.0)
## car * 2.1-5 2017-07-04 CRAN (R 3.4.1)
## cellranger 1.1.0 2016-07-27 CRAN (R 3.4.0)
## colorspace 1.3-2 2016-12-14 CRAN (R 3.4.0)
## compiler 3.4.2 2017-09-29 local
## curl 2.8.1 2017-07-21 CRAN (R 3.4.1)
## datasets * 3.4.2 2017-09-29 local
## devtools 1.13.2 2017-06-02 CRAN (R 3.4.1)
## digest 0.6.12 2017-01-27 CRAN (R 3.4.0)
## dplyr * 0.7.2 2017-07-20 CRAN (R 3.4.1)
## evaluate 0.10.1 2017-06-24 CRAN (R 3.4.1)
## forcats 0.2.0 2017-01-23 CRAN (R 3.4.0)
## foreign 0.8-69 2017-06-21 CRAN (R 3.4.1)
## ggplot2 * 2.2.1.9000 2017-10-24 Github (tidyverse/ggplot2@ffb40f3)
## ggrepel 0.6.9 2017-05-05 Github (slowkow/ggrepel@d21b468)
## glue 1.1.1 2017-06-21 CRAN (R 3.4.1)
## graphics * 3.4.2 2017-09-29 local
## grDevices * 3.4.2 2017-09-29 local
## grid 3.4.2 2017-09-29 local
## gtable 0.2.0 2016-02-26 CRAN (R 3.4.0)
## haven 1.1.0 2017-07-09 CRAN (R 3.4.1)
## hms 0.3 2016-11-22 CRAN (R 3.4.0)
## htmltools 0.3.6 2017-04-28 CRAN (R 3.4.0)
## httr 1.2.1 2016-07-03 CRAN (R 3.4.0)
## jsonlite 1.5 2017-06-01 cran (@1.5)
## knitr 1.16 2017-05-18 cran (@1.16)
## labeling 0.3 2014-08-23 CRAN (R 3.4.0)
## lattice 0.20-35 2017-03-25 CRAN (R 3.3.3)
## lazyeval 0.2.0 2016-06-12 CRAN (R 3.4.0)
## lme4 1.1-13 2017-04-19 CRAN (R 3.4.0)
## lubridate 1.6.0 2016-09-13 CRAN (R 3.4.0)
## magrittr 1.5 2014-11-22 CRAN (R 3.4.0)
## MASS 7.3-47 2017-04-21 CRAN (R 3.4.0)
## Matrix 1.2-11 2017-08-16 CRAN (R 3.4.1)
## MatrixModels 0.4-1 2015-08-22 CRAN (R 3.4.0)
## memoise 1.1.0 2017-04-21 CRAN (R 3.4.0)
## methods * 3.4.2 2017-09-29 local
## mgcv 1.8-18 2017-07-28 CRAN (R 3.4.1)
## minqa 1.2.4 2014-10-09 CRAN (R 3.4.0)
## mnormt 1.5-5 2016-10-15 CRAN (R 3.4.0)
## modelr 0.1.1 2017-07-24 CRAN (R 3.4.1)
## munsell 0.4.3 2016-02-13 CRAN (R 3.4.0)
## nlme 3.1-131 2017-02-06 CRAN (R 3.4.0)
## nloptr 1.0.4 2014-08-04 CRAN (R 3.4.0)
## nnet 7.3-12 2016-02-02 CRAN (R 3.4.0)
## parallel 3.4.2 2017-09-29 local
## pbkrtest 0.4-7 2017-03-15 CRAN (R 3.4.0)
## PerformanceAnalytics 1.4.3541 2014-09-16 CRAN (R 3.4.0)
## pkgconfig 2.0.1 2017-03-21 cran (@2.0.1)
## plyr 1.8.4 2016-06-08 CRAN (R 3.4.0)
## psych 1.7.5 2017-05-03 CRAN (R 3.4.0)
## purrr * 0.2.3 2017-08-02 cran (@0.2.3)
## quantreg 5.33 2017-04-18 CRAN (R 3.4.0)
## R6 2.2.2 2017-06-17 CRAN (R 3.4.1)
## Rcpp 0.12.13 2017-09-28 cran (@0.12.13)
## readr * 1.1.1 2017-05-16 CRAN (R 3.4.1)
## readxl 1.0.0 2017-04-18 CRAN (R 3.4.0)
## reshape2 1.4.2 2016-10-22 CRAN (R 3.4.0)
## rlang 0.1.2 2017-08-09 cran (@0.1.2)
## rmarkdown 1.6 2017-06-15 CRAN (R 3.4.1)
## Rmisc 1.5 2013-10-22 CRAN (R 3.4.0)
## rprojroot 1.2 2017-01-16 CRAN (R 3.4.0)
## rvest 0.3.2 2016-06-17 CRAN (R 3.4.0)
## scales 0.5.0.9000 2017-10-06 Github (hadley/scales@d767915)
## SparseM 1.77 2017-04-23 CRAN (R 3.4.0)
## splines 3.4.2 2017-09-29 local
## stats * 3.4.2 2017-09-29 local
## stringi 1.1.5 2017-04-07 CRAN (R 3.4.0)
## stringr 1.2.0 2017-02-18 CRAN (R 3.4.0)
## tibble * 1.3.4 2017-08-22 cran (@1.3.4)
## tidyr * 0.6.3 2017-05-15 CRAN (R 3.4.1)
## tidyverse * 1.1.1 2017-01-27 CRAN (R 3.4.0)
## tools 3.4.2 2017-09-29 local
## utils * 3.4.2 2017-09-29 local
## withr 2.0.0 2017-10-24 Github (jimhester/withr@a43df66)
## xml2 1.1.1 2017-01-24 CRAN (R 3.4.0)
## xts 0.10-0 2017-07-07 CRAN (R 3.4.1)
## yaml 2.1.14 2016-11-12 CRAN (R 3.4.0)
## zoo 1.8-0 2017-04-12 CRAN (R 3.4.0)
Fox, John, and Sanford Weisberg. 2017. Car: Companion to Applied Regression. https://CRAN.R-project.org/package=car.
Robinson, David. 2017. Broom: Convert Statistical Analysis Objects into Tidy Data Frames. https://CRAN.R-project.org/package=broom.
Slowikowski, Kamil. 2017. Ggrepel: Repulsive Text and Label Geoms for ’Ggplot2’. http://github.com/slowkow/ggrepel.
Wickham, Hadley. 2017. Tidyverse: Easily Install and Load ’Tidyverse’ Packages. https://CRAN.R-project.org/package=tidyverse.
Wickham, Hadley, and Winston Chang. 2017a. Devtools: Tools to Make Developing R Packages Easier. https://CRAN.R-project.org/package=devtools.
———. 2017b. Ggplot2: Create Elegant Data Visualisations Using the Grammar of Graphics.
Wickham, Hadley, Jim Hester, and Romain Francois. 2017. Readr: Read Rectangular Text Data. https://CRAN.R-project.org/package=readr.
Xie, Yihui. 2017. Knitr: A General-Purpose Package for Dynamic Report Generation in R. https://CRAN.R-project.org/package=knitr.