library(ISLR)
library(tidyverse)

data(Wage)

# Data Summary Statistics ----

# Preview the first six observations (six is head()'s default row count).
head(Wage, n = 6L)
##        year age           maritl     race       education
## 231655 2006  18 1. Never Married 1. White    1. < HS Grad
## 86582  2004  24 1. Never Married 1. White 4. College Grad
## 161300 2003  45       2. Married 1. White 3. Some College
## 155159 2003  43       2. Married 3. Asian 4. College Grad
## 11443  2005  50      4. Divorced 1. White      2. HS Grad
## 376662 2008  54       2. Married 1. White 4. College Grad
##                    region       jobclass         health health_ins
## 231655 2. Middle Atlantic  1. Industrial      1. <=Good      2. No
## 86582  2. Middle Atlantic 2. Information 2. >=Very Good      2. No
## 161300 2. Middle Atlantic  1. Industrial      1. <=Good     1. Yes
## 155159 2. Middle Atlantic 2. Information 2. >=Very Good     1. Yes
## 11443  2. Middle Atlantic 2. Information      1. <=Good     1. Yes
## 376662 2. Middle Atlantic 2. Information 2. >=Very Good     1. Yes
##         logwage      wage
## 231655 4.318063  75.04315
## 86582  4.255273  70.47602
## 161300 4.875061 130.98218
## 155159 5.041393 154.68529
## 11443  4.318063  75.04315
## 376662 4.845098 127.11574
# Column-wise summaries: quantiles and mean for numeric columns, level counts
# for factor columns.
summary(object = Wage)
##       year           age                     maritl           race     
##  Min.   :2003   Min.   :18.00   1. Never Married: 648   1. White:2480  
##  1st Qu.:2004   1st Qu.:33.75   2. Married      :2074   2. Black: 293  
##  Median :2006   Median :42.00   3. Widowed      :  19   3. Asian: 190  
##  Mean   :2006   Mean   :42.41   4. Divorced     : 204   4. Other:  37  
##  3rd Qu.:2008   3rd Qu.:51.00   5. Separated    :  55                  
##  Max.   :2009   Max.   :80.00                                          
##                                                                        
##               education                     region    
##  1. < HS Grad      :268   2. Middle Atlantic   :3000  
##  2. HS Grad        :971   1. New England       :   0  
##  3. Some College   :650   3. East North Central:   0  
##  4. College Grad   :685   4. West North Central:   0  
##  5. Advanced Degree:426   5. South Atlantic    :   0  
##                           6. East South Central:   0  
##                           (Other)              :   0  
##            jobclass               health      health_ins      logwage     
##  1. Industrial :1544   1. <=Good     : 858   1. Yes:2083   Min.   :3.000  
##  2. Information:1456   2. >=Very Good:2142   2. No : 917   1st Qu.:4.447  
##                                                            Median :4.653  
##                                                            Mean   :4.654  
##                                                            3rd Qu.:4.857  
##                                                            Max.   :5.763  
##                                                                           
##       wage       
##  Min.   : 20.09  
##  1st Qu.: 85.38  
##  Median :104.92  
##  Mean   :111.70  
##  3rd Qu.:128.68  
##  Max.   :318.34  
## 
# Compact structural overview: dimensions, column types, and leading values.
str(object = Wage)
## 'data.frame':    3000 obs. of  11 variables:
##  $ year      : int  2006 2004 2003 2003 2005 2008 2009 2008 2006 2004 ...
##  $ age       : int  18 24 45 43 50 54 44 30 41 52 ...
##  $ maritl    : Factor w/ 5 levels "1. Never Married",..: 1 1 2 2 4 2 2 1 1 2 ...
##  $ race      : Factor w/ 4 levels "1. White","2. Black",..: 1 1 1 3 1 1 4 3 2 1 ...
##  $ education : Factor w/ 5 levels "1. < HS Grad",..: 1 4 3 4 2 4 3 3 3 2 ...
##  $ region    : Factor w/ 9 levels "1. New England",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ jobclass  : Factor w/ 2 levels "1. Industrial",..: 1 2 1 2 2 2 1 2 2 2 ...
##  $ health    : Factor w/ 2 levels "1. <=Good","2. >=Very Good": 1 2 1 2 1 2 2 1 2 2 ...
##  $ health_ins: Factor w/ 2 levels "1. Yes","2. No": 2 2 1 1 1 1 1 1 1 1 ...
##  $ logwage   : num  4.32 4.26 4.88 5.04 4.32 ...
##  $ wage      : num  75 70.5 131 154.7 75 ...

# Scatter Plot with Linear Trend ----

 ggplot(data = Wage, mapping = aes(x = age, y = wage)) +
   geom_point() +
   # Overlay a straight-line (lm) fit of wage on age.
   geom_smooth(formula = y ~ x, method = "lm")

# Base R

# Fit wage on age once, draw the raw scatter, then add the fitted line.
wage_age_fit <- lm(wage ~ age, data = Wage)
plot(x = Wage$age, y = Wage$wage)
abline(wage_age_fit)

# Density Plot ----

# Overlapping wage densities, one per education level; the low alpha keeps
# the overlapping fills see-through.
ggplot(
  data = Wage,
  mapping = aes(x = wage, colour = education, fill = education)
) +
  geom_density(alpha = 0.1)

# Box and Whisker Plot ----

# One box of wages per education level.
ggplot(data = Wage, mapping = aes(x = education, y = wage)) +
  geom_boxplot()

## Base R
# Build the group labels from the factor levels (dropping the leading "N. "
# prefix) so they cannot drift from the data; the previously hand-typed
# "Advanced Grad" mislabelled the "5. Advanced Degree" level.
education_labels <- sub("^[0-9]+\\. ", "", levels(Wage$education))

# Widen the left margin to fit the long horizontal labels. par() returns the
# previous settings, which are restored afterwards so that later base plots
# do not inherit the wide margin.
old_par <- par(mar = c(5, 10, 4, 2) + 0.1)
boxplot(
  wage ~ education,
  data = Wage,
  names = education_labels,
  horizontal = TRUE,
  las = 1,
  xlab = "Wage"
)
par(old_par)

# Histograms ----

# Wage distribution drawn in one panel per marital status.
ggplot(data = Wage, mapping = aes(x = wage)) +
  # State the bin count explicitly: 30 is what geom_histogram() falls back
  # to, but relying on the fallback emits a "Pick better value" message.
  geom_histogram(bins = 30) +
  facet_wrap(~maritl) +
  labs(
    title = "Histograms of Wages by Marital Status",
    caption = "data from package ISLR"
  ) +
  # Centre the title over the panels.
  theme(plot.title = element_text(hjust = 0.5))

# Stacked Bar Charts ----

# Load the College data set and preview its first six rows.
data("College")

head(College, n = 6L)
##                              Private Apps Accept Enroll Top10perc
## Abilene Christian University     Yes 1660   1232    721        23
## Adelphi University               Yes 2186   1924    512        16
## Adrian College                   Yes 1428   1097    336        22
## Agnes Scott College              Yes  417    349    137        60
## Alaska Pacific University        Yes  193    146     55        16
## Albertson College                Yes  587    479    158        38
##                              Top25perc F.Undergrad P.Undergrad Outstate
## Abilene Christian University        52        2885         537     7440
## Adelphi University                  29        2683        1227    12280
## Adrian College                      50        1036          99    11250
## Agnes Scott College                 89         510          63    12960
## Alaska Pacific University           44         249         869     7560
## Albertson College                   62         678          41    13500
##                              Room.Board Books Personal PhD Terminal
## Abilene Christian University       3300   450     2200  70       78
## Adelphi University                 6450   750     1500  29       30
## Adrian College                     3750   400     1165  53       66
## Agnes Scott College                5450   450      875  92       97
## Alaska Pacific University          4120   800     1500  76       72
## Albertson College                  3335   500      675  67       73
##                              S.F.Ratio perc.alumni Expend Grad.Rate
## Abilene Christian University      18.1          12   7041        60
## Adelphi University                12.2          16  10527        56
## Adrian College                    12.9          30   8735        54
## Agnes Scott College                7.7          37  19016        59
## Alaska Pacific University         11.9           2  10922        15
## Albertson College                  9.4          11   9727        55
# Move the college names out of the row names into a regular column.
College_DF <- rownames_to_column(College, var = "College")

# Full institution name -> short label. This is the single source of truth
# for both the row filter and the relabelling below, so the two can no
# longer drift apart.
missouri_labels <- c(
  "Central Missouri State University" = "UCM",
  "Southwest Missouri State University" = "MSU",
  "University of Missouri at Columbia" = "MU"
)

# Long table with one row per (college, admissions stage) for the three
# selected Missouri schools, sorted by short label.
Missouri_Publics <-
  College_DF %>%
  filter(College %in% names(missouri_labels)) %>%
  select(College, Apps, Accept, Enroll) %>%
  mutate(College = unname(missouri_labels[College])) %>%
  # pivot_longer() is the maintained replacement for the superseded gather().
  pivot_longer(
    cols = -College,
    names_to = "Enrollment_Cycle",
    values_to = "Count"
  ) %>%
  arrange(College)

# Stacked bars of the Apps / Accept / Enroll counts per school, with every
# segment annotated with its count.
ggplot(
  data = Missouri_Publics,
  mapping = aes(x = College, y = Count, fill = Enrollment_Cycle)
) +
  geom_bar(stat = "identity") +
  # Stack the labels the same way as the bars so each lands on its segment.
  geom_text(
    mapping = aes(label = Count),
    position = "stack",
    size = 3,
    hjust = 0.5,
    vjust = 3
  ) +
  theme_bw() +
  # Must follow theme_bw(), which would otherwise reset the title alignment.
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(
    title = "Applied, Admitted and Enrolled at MSU, MU and UCM",
    caption = "Source: 1995 US News and World Report"
  )

# Line Plot ----

# Load the long-format economics data and keep only the "uempmed" series.
data("economics_long")

unemployment_df <- filter(economics_long, variable == "uempmed")

# Preview the first six rows of the filtered series.
head(unemployment_df, n = 6L)
## # A tibble: 6 x 4
## # Groups:   variable [1]
##         date variable value    value01
##       <date>   <fctr> <dbl>      <dbl>
## 1 1967-07-01  uempmed   4.5 0.02358491
## 2 1967-08-01  uempmed   4.7 0.03301887
## 3 1967-09-01  uempmed   4.6 0.02830189
## 4 1967-10-01  uempmed   4.9 0.04245283
## 5 1967-11-01  uempmed   4.7 0.03301887
## 6 1967-12-01  uempmed   4.8 0.03773585
# Time series of the "uempmed" values.
# The y-axis label previously read "Number of Unemployed (thousands)", which
# describes the `unemploy` series; `uempmed` is the median duration of
# unemployment, measured in weeks.
ggplot(data = unemployment_df, mapping = aes(x = date, y = value)) +
  geom_line() +
  theme_bw() +
  labs(x = "Year", y = "Median Duration of Unemployment (weeks)")