Installing packages

#install.packages(c("knitr", "tidyverse","readxl","ggplot2", "XLConnect"))

Loading packages and dataset

library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ ggplot2 3.3.2     ✔ purrr   0.3.4
## ✔ tibble  3.0.3     ✔ dplyr   1.0.2
## ✔ tidyr   1.1.2     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.6.2
## Warning: package 'tibble' was built under R version 3.6.2
## Warning: package 'tidyr' was built under R version 3.6.2
## Warning: package 'purrr' was built under R version 3.6.2
## Warning: package 'dplyr' was built under R version 3.6.2
## ── Conflicts ───────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(readxl)
library(ggplot2) 

# Read the "ByAge" worksheet of cpsaat03.xlsx; the path is relative to the
# current working directory.
cpsaat03 <- read_excel(path = "cpsaat03.xlsx", sheet = "ByAge")

#View(cpsaat03)  

Summary statistics for each variable

#summary(cpsaat03)

# Build a table of summary statistics with one row per variable.
#   1. summary() on every column except Age gives a `table` object.
#   2. as.data.frame() turns that table into long format; the Var1 column is
#      not needed and is dropped.
#   3. Each Freq entry is text of the form "<statistic>:<value>", so it is
#      split at the colon into Stat and Num.
#   4. pivot_wider() spreads the statistics into one column each.
# Stat and Num are kept exactly as summary() formatted them (text, including
# any padding).
sum <- cpsaat03 %>%
  select(-Age) %>%
  summary() %>%
  as.data.frame() %>%
  select(-Var1) %>%
  separate(Freq, into = c("Stat", "Num"), sep = ":") %>%
  pivot_wider(names_from = Stat, values_from = Num)

view(sum)

Rename Variables, create percentages

#s1
#cpsaat03$NLF <- cpsaat03$`Civilian labor force Not in Labor force`
#cpsaat03$NIP <- cpsaat03$`Civilian NI pop` 
#cpsaat03$CLF <- cpsaat03$`Civilian labor force Total`

#cpsaat03$CLE <- cpsaat03$`Civilian labor force employed total` 

#percentages
#cpsaat03$NLF_Percent <- cpsaat03$NLF/cpsaat03$NIP 
#cpsaat03$CLF_Percent <- cpsaat03$CLF/cpsaat03$NIP

#s2: do everything in a single mutate() call
# First copy the long spreadsheet columns under short names, then express
# NLF and CLF as a percentage of NIP. Later arguments of mutate() can use the
# columns created by earlier ones, which is why the order matters here.
cpsaat03 <- mutate(
  cpsaat03,
  NLF = `Civilian labor force Not in Labor force`,
  NIP = `Civilian NI pop`,
  CLF = `Civilian labor force Total`,
  CLE = `Civilian labor force employed total`,
  NLF_Percent = NLF / NIP * 100,
  CLF_Percent = CLF / NIP * 100
)
#Summary statistics for each variable (updated)
#summary(cpsaat03)

Bar Chart of Civilian Labor Force Total with labels by Age

# Horizontal bar chart with one bar per Age value. geom_bar() sums the
# `weight` aesthetic instead of counting rows, so each bar's length is CLF.
bar <- ggplot(data = cpsaat03, mapping = aes(x = Age, weight = CLF)) +
  geom_bar() +
  coord_flip() +
  ylab('CLF Total')

print(bar)

Scatter Plot of Age and Civilian Labor Force by Percent of Population

  1. With original colors
  2. With one new color
# Default (black) dots, drawn large.
scatter1 <- ggplot(cpsaat03, aes(x = Age, y = CLF_Percent)) +
  geom_point(size = 10)
print(scatter1)

# Medium-sized red dots. The colour is a fixed value, not a mapped aesthetic,
# so it is set outside aes().
scatter2 <- ggplot(cpsaat03) +
  geom_point(aes(x = Age, y = CLF_Percent), color = 'Red', size = 5)
print(scatter2)

# Default-sized purple dots.
scatter3 <- ggplot(cpsaat03) +
  geom_point(aes(x = Age, y = CLF_Percent), color = 'Purple')
print(scatter3)

Scatter Plot of Civilian Labor Force (Percent), Age, and Civilian Labor Force (Employed)

#s1
# Two series on one plot with a secondary y-axis.
# coeff is the ratio between the largest CLE and the largest CLF_Percent.
# Dividing CLE by it puts the red CLE points on the CLF_Percent scale, and the
# secondary axis multiplies by the same factor so its labels read in CLE units.
# It is computed from the data (not hard-coded) so the two axes stay aligned
# if the spreadsheet changes.
coeff <- max(cpsaat03$CLE, na.rm = TRUE) /
  max(cpsaat03$CLF_Percent, na.rm = TRUE)
scatter4 <- ggplot(cpsaat03) +
  geom_point(aes(Age, CLF_Percent)) +
  geom_point(aes(Age, CLE / coeff), color = 'Red') +
  scale_y_continuous(
    name = 'CLF_Percent',
    sec.axis = sec_axis(~ . * coeff, name = 'CLE')
  )
print(scatter4)

#s2
# Alternative without a second axis: one layer of points whose size
# encodes CLE.
scatter4.1 <- ggplot(cpsaat03, aes(x = Age, y = CLF_Percent)) +
  geom_point(aes(size = CLE))
print(scatter4.1)

Scatter Plot of Percent of Population Not in Labor Force by Age, with differently colored dots for Ages where labor force non-participation is over 50%

#the same plot is built in several different ways below (s1-s4)
#s1
# Compute one colour per row up front and pass the vector as a fixed
# (non-aes) colour: green where NLF_Percent is above 50, black otherwise.
# Because the colours are not mapped through a scale, no legend is drawn.
scatter5 <- ggplot(cpsaat03, aes(x = Age, y = NLF_Percent)) +
  geom_point(color = with(cpsaat03, ifelse(NLF_Percent > 50, 'Green', 'Black')))
print(scatter5)

#s2
# Map the logical test itself to colour, then assign a colour to each outcome
# with a manual scale. The scale matches values by name, so the names are the
# strings "TRUE" and "FALSE". They are spelled out explicitly instead of being
# derived from T / F, which are ordinary variables that can be reassigned.
scatter5 <- ggplot(cpsaat03) +
  geom_point(aes(x = Age, y = NLF_Percent, color = NLF_Percent > 50)) +
  scale_colour_manual(
    name = 'Over 50%',
    values = c("TRUE" = 'green', "FALSE" = 'black')
  )
print(scatter5)

#s3: add a "Color" column holding the colour name for each row, then map it
# with scale_color_identity(), which uses the column values directly as
# colours. Without that scale ggplot2 would treat "green" and "black" as
# ordinary category labels and pick its own default colours (remove it to see
# the difference).
tt <- cpsaat03
ggplot(
  mutate(tt, Color = ifelse(NLF_Percent > 50, "green", "black")),
  aes(x = Age, y = NLF_Percent, color = Color)
) +
  scale_color_identity() +
  geom_point()

#s4: another way to create the "Color" column, using cut()
# cut() uses right-closed intervals by default, so breaks of -Inf, 50, Inf
# give (-Inf, 50] -> "black" and (50, Inf] -> "green". This is the same
# "over 50%" rule as the `NLF_Percent > 50` test in the other methods
# (a break at 49.9 would wrongly colour values between 49.9 and 50 green).
tt$Color <- cut(
  tt$NLF_Percent,
  breaks = c(-Inf, 50, Inf),
  labels = c("black", "green")
)

ggplot(tt, aes(x = Age, y = NLF_Percent, color = Color)) +
  geom_point() +
  scale_color_identity()