# Attach ISLR (presumably the source of the datasets loaded with data() in
# this script; the histogram caption credits it for Wage) and the tidyverse,
# which supplies ggplot(), the dplyr/tidyr verbs and the %>% pipe used below.
library(ISLR)
library(tidyverse)
# Make the Wage data frame available in the workspace.
data(Wage)
# Data Summary Statistics ----
# Preview the first rows of Wage (the echoed output follows).
head(Wage)
## year age maritl race education
## 231655 2006 18 1. Never Married 1. White 1. < HS Grad
## 86582 2004 24 1. Never Married 1. White 4. College Grad
## 161300 2003 45 2. Married 1. White 3. Some College
## 155159 2003 43 2. Married 3. Asian 4. College Grad
## 11443 2005 50 4. Divorced 1. White 2. HS Grad
## 376662 2008 54 2. Married 1. White 4. College Grad
## region jobclass health health_ins
## 231655 2. Middle Atlantic 1. Industrial 1. <=Good 2. No
## 86582 2. Middle Atlantic 2. Information 2. >=Very Good 2. No
## 161300 2. Middle Atlantic 1. Industrial 1. <=Good 1. Yes
## 155159 2. Middle Atlantic 2. Information 2. >=Very Good 1. Yes
## 11443 2. Middle Atlantic 2. Information 1. <=Good 1. Yes
## 376662 2. Middle Atlantic 2. Information 2. >=Very Good 1. Yes
## logwage wage
## 231655 4.318063 75.04315
## 86582 4.255273 70.47602
## 161300 4.875061 130.98218
## 155159 5.041393 154.68529
## 11443 4.318063 75.04315
## 376662 4.845098 127.11574
# Per-column summary: quantiles and mean for the numeric columns, level counts
# for the factor columns.
summary(Wage)
## year age maritl race
## Min. :2003 Min. :18.00 1. Never Married: 648 1. White:2480
## 1st Qu.:2004 1st Qu.:33.75 2. Married :2074 2. Black: 293
## Median :2006 Median :42.00 3. Widowed : 19 3. Asian: 190
## Mean :2006 Mean :42.41 4. Divorced : 204 4. Other: 37
## 3rd Qu.:2008 3rd Qu.:51.00 5. Separated : 55
## Max. :2009 Max. :80.00
##
## education region
## 1. < HS Grad :268 2. Middle Atlantic :3000
## 2. HS Grad :971 1. New England : 0
## 3. Some College :650 3. East North Central: 0
## 4. College Grad :685 4. West North Central: 0
## 5. Advanced Degree:426 5. South Atlantic : 0
## 6. East South Central: 0
## (Other) : 0
## jobclass health health_ins logwage
## 1. Industrial :1544 1. <=Good : 858 1. Yes:2083 Min. :3.000
## 2. Information:1456 2. >=Very Good:2142 2. No : 917 1st Qu.:4.447
## Median :4.653
## Mean :4.654
## 3rd Qu.:4.857
## Max. :5.763
##
## wage
## Min. : 20.09
## 1st Qu.: 85.38
## Median :104.92
## Mean :111.70
## 3rd Qu.:128.68
## Max. :318.34
##
# Compact structure listing: dimensions, then each column's type and first
# few values.
str(Wage)
## 'data.frame': 3000 obs. of 11 variables:
## $ year : int 2006 2004 2003 2003 2005 2008 2009 2008 2006 2004 ...
## $ age : int 18 24 45 43 50 54 44 30 41 52 ...
## $ maritl : Factor w/ 5 levels "1. Never Married",..: 1 1 2 2 4 2 2 1 1 2 ...
## $ race : Factor w/ 4 levels "1. White","2. Black",..: 1 1 1 3 1 1 4 3 2 1 ...
## $ education : Factor w/ 5 levels "1. < HS Grad",..: 1 4 3 4 2 4 3 3 3 2 ...
## $ region : Factor w/ 9 levels "1. New England",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ jobclass : Factor w/ 2 levels "1. Industrial",..: 1 2 1 2 2 2 1 2 2 2 ...
## $ health : Factor w/ 2 levels "1. <=Good","2. >=Very Good": 1 2 1 2 1 2 2 1 2 2 ...
## $ health_ins: Factor w/ 2 levels "1. Yes","2. No": 2 2 1 1 1 1 1 1 1 1 ...
## $ logwage : num 4.32 4.26 4.88 5.04 4.32 ...
## $ wage : num 75 70.5 131 154.7 75 ...
# Scatter Plot with Linear Trend ----
# Wage against age as points, overlaid with a linear-model smooth (y ~ x).
ggplot(data = Wage, mapping = aes(x = age, y = wage)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x)

# Base R
# Same picture with base graphics: draw the scatter first, then add the
# fitted wage ~ age regression line on top of it.
plot(x = Wage$age, y = Wage$wage)
abline(reg = lm(formula = wage ~ age, data = Wage))

# Density Plot ----
# Overlaid wage densities: one translucent curve per education level, with
# outline and fill both keyed to education.
ggplot(Wage, aes(x = wage, colour = education, fill = education)) +
  geom_density(alpha = 0.1)

# Box and Whisker Plot ----
# One box per education level showing the distribution of wage.
ggplot(data = Wage, mapping = aes(x = education, y = wage)) +
  geom_boxplot()

# Base R
# Horizontal box plot of wage by education level.
#
# The left margin is widened so the horizontal (las = 1) group labels fit.
# par() returns the previous settings, which are restored after plotting so
# later base-graphics plots are not drawn with the enlarged margin.
old_par <- par(mar = c(5, 10, 4, 2) + 0.1)

# Build the group labels from the factor levels by stripping the leading
# "1. "-style prefix. This keeps the labels in step with the data: the last
# level is "Advanced Degree", which the hand-typed "Advanced Grad" misnamed.
education_labels <- sub("^[0-9]+\\. ", "", levels(Wage$education))

boxplot(
  wage ~ education,
  data = Wage,
  names = education_labels,
  horizontal = TRUE,
  las = 1,
  xlab = "Wage"
)

par(old_par)

# Histograms ----
# Wage histograms, one panel per marital status, with a centred title and a
# data-source caption.
#
# bins is set explicitly to ggplot2's default of 30: the rendering is
# unchanged, but the "`stat_bin()` using `bins = 30`" message that the
# implicit default triggers is no longer emitted.
ggplot(Wage, aes(x = wage)) +
  facet_wrap(~maritl) +
  geom_histogram(bins = 30) +
  ggtitle("Histograms of Wages by Marital Status") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(caption = "data from package ISLR")

# Stacked Bar Charts ----
# Make the College data frame available and preview its first rows; the
# institution names are carried as row names rather than as a column.
data(College)
head(College)
## Private Apps Accept Enroll Top10perc
## Abilene Christian University Yes 1660 1232 721 23
## Adelphi University Yes 2186 1924 512 16
## Adrian College Yes 1428 1097 336 22
## Agnes Scott College Yes 417 349 137 60
## Alaska Pacific University Yes 193 146 55 16
## Albertson College Yes 587 479 158 38
## Top25perc F.Undergrad P.Undergrad Outstate
## Abilene Christian University 52 2885 537 7440
## Adelphi University 29 2683 1227 12280
## Adrian College 50 1036 99 11250
## Agnes Scott College 89 510 63 12960
## Alaska Pacific University 44 249 869 7560
## Albertson College 62 678 41 13500
## Room.Board Books Personal PhD Terminal
## Abilene Christian University 3300 450 2200 70 78
## Adelphi University 6450 750 1500 29 30
## Adrian College 3750 400 1165 53 66
## Agnes Scott College 5450 450 875 92 97
## Alaska Pacific University 4120 800 1500 76 72
## Albertson College 3335 500 675 67 73
## S.F.Ratio perc.alumni Expend Grad.Rate
## Abilene Christian University 18.1 12 7041 60
## Adelphi University 12.2 16 10527 56
## Adrian College 12.9 30 8735 54
## Agnes Scott College 7.7 37 19016 59
## Alaska Pacific University 11.9 2 10922 15
## Albertson College 9.4 11 9727 55
# Promote the row names (the institution names) to an ordinary column called
# "College" so they can be filtered and plotted like any other variable.
College_DF <- rownames_to_column(College, var = "College")
# Long-format application funnel for three Missouri public universities.
#
# Keeps the Apps / Accept / Enroll columns for the three institutions,
# shortens their names to abbreviations, and reshapes to one row per
# (College, Enrollment_Cycle) pair with the value in Count, sorted by College.
Missouri_Publics <-
  College_DF %>%
  filter(
    College %in% c(
      "Central Missouri State University",
      "Southwest Missouri State University",
      "University of Missouri at Columbia"
    )
  ) %>%
  select(College, Apps, Accept, Enroll) %>%
  mutate(
    College = case_when(
      College == "Central Missouri State University" ~ "UCM",
      College == "Southwest Missouri State University" ~ "MSU",
      College == "University of Missouri at Columbia" ~ "MU"
    )
  ) %>%
  # pivot_longer() replaces the superseded gather(). Column names are the
  # same, and once sorted by College the rows appear in the same
  # Apps, Accept, Enroll order. The result is a tibble.
  pivot_longer(
    cols = -College,
    names_to = "Enrollment_Cycle",
    values_to = "Count"
  ) %>%
  arrange(College)
# Stacked bars of Count per College, split by Enrollment_Cycle. Each segment
# is labelled with its Count; the labels use the same stacking as the bars
# and are nudged downwards with vjust.
ggplot(
  data = Missouri_Publics,
  mapping = aes(x = College, y = Count, fill = Enrollment_Cycle)
) +
  geom_bar(stat = "identity") +
  geom_text(
    mapping = aes(label = Count),
    position = "stack",
    size = 3,
    hjust = 0.5,
    vjust = 3
  ) +
  theme_bw() +
  ggtitle("Applied, Admitted and Enrolled at MSU, MU and UCM") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(caption = "Source: 1995 US News and World Report")

# Line Plot ----
# Load the long-format economics data, keep only the rows of the "uempmed"
# series, and preview the result.
data(economics_long)
unemployment_df <- filter(economics_long, variable == "uempmed")
head(unemployment_df)
## # A tibble: 6 x 4
## # Groups: variable [1]
## date variable value value01
## <date> <fctr> <dbl> <dbl>
## 1 1967-07-01 uempmed 4.5 0.02358491
## 2 1967-08-01 uempmed 4.7 0.03301887
## 3 1967-09-01 uempmed 4.6 0.02830189
## 4 1967-10-01 uempmed 4.9 0.04245283
## 5 1967-11-01 uempmed 4.7 0.03301887
## 6 1967-12-01 uempmed 4.8 0.03773585
# Time series of the uempmed values held in unemployment_df.
#
# uempmed is the median duration of unemployment in weeks (per the ggplot2
# `economics` dataset documentation), not a count of unemployed people, so
# the y-axis label names that quantity; the previous label described the
# `unemploy` series instead.
ggplot(unemployment_df, aes(x = date, y = value)) +
  geom_line() +
  theme_bw() +
  ylab("Median Duration of Unemployment (weeks)") +
  xlab("Year")
