Installing packages
#install.packages(c("knitr", "tidyverse","readxl","ggplot2", "XLConnect"))
Loading packages and dataset
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ ggplot2 3.3.2 ✔ purrr 0.3.4
## ✔ tibble 3.0.3 ✔ dplyr 1.0.2
## ✔ tidyr 1.1.2 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.6.2
## Warning: package 'tibble' was built under R version 3.6.2
## Warning: package 'tidyr' was built under R version 3.6.2
## Warning: package 'purrr' was built under R version 3.6.2
## Warning: package 'dplyr' was built under R version 3.6.2
## ── Conflicts ───────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(readxl)
library(ggplot2)
# Read the "ByAge" worksheet of the workbook into `cpsaat03`.
cpsaat03 <- read_excel(path = "cpsaat03.xlsx", sheet = "ByAge")
# Optional interactive inspection of the loaded data:
# View(cpsaat03)
Summary statistics for each variable
# Quick console check: summary(cpsaat03)
#
# Build the summary statistics as a table:
#   1. drop Age and run summary() on the remaining columns;
#   2. as.data.frame() turns the summary table into long form
#      (columns Var1, Var2, Freq), where Freq holds "name:value" strings;
#   3. drop Var1, split Freq at ":" into Stat / Num;
#   4. spread the statistics into one column per Stat.
# NOTE: the variable name `sum` is the same as base R's sum() function;
# it is kept unchanged here in case later code refers to it.
sum <- cpsaat03 %>%
  select(-Age) %>%
  summary() %>%
  as.data.frame() %>%
  select(-Var1) %>%
  separate(Freq, into = c("Stat", "Num"), sep = ":") %>%
  pivot_wider(names_from = Stat, values_from = Num)
# class(sum); colnames(sum)  # handy when checking intermediate results
view(sum)
Rename Variables, create percentages
# Method 1 (base R, kept for reference): assign each new column with `$`.
# cpsaat03$NLF <- cpsaat03$`Civilian labor force Not in Labor force`
# cpsaat03$NIP <- cpsaat03$`Civilian NI pop`
# cpsaat03$CLF <- cpsaat03$`Civilian labor force Total`
# cpsaat03$CLE <- cpsaat03$`Civilian labor force employed total`
# cpsaat03$NLF_Percent <- cpsaat03$NLF / cpsaat03$NIP
# cpsaat03$CLF_Percent <- cpsaat03$CLF / cpsaat03$NIP

# Method 2 (used): dplyr::mutate().
# Step 1 adds short aliases next to the original long-named columns;
# step 2 expresses NLF and CLF as a percentage of NIP.
cpsaat03 <- cpsaat03 %>%
  mutate(
    NLF = `Civilian labor force Not in Labor force`,
    NIP = `Civilian NI pop`,
    CLF = `Civilian labor force Total`,
    CLE = `Civilian labor force employed total`
  ) %>%
  mutate(
    NLF_Percent = NLF / NIP * 100,
    CLF_Percent = CLF / NIP * 100
  )
# Re-check the summary statistics with the new columns included:
# summary(cpsaat03)
Bar Chart of Civilian Labor Force Total with labels by Age
# Horizontal bar chart: one bar per Age value, bar length = CLF
# (geom_bar sums the `weight` aesthetic instead of counting rows).
bar <- ggplot(cpsaat03, aes(x = Age, weight = CLF)) +
  geom_bar() +
  labs(y = "CLF Total") +
  coord_flip()
print(bar)

Scatter Plot of Age and Civilian Labor Force by Percent of Population
- With original colors
- With one new color
# Default point colour, large points.
scatter1 <- ggplot(cpsaat03, aes(x = Age, y = CLF_Percent)) +
  geom_point(size = 10)
print(scatter1)

# Fixed colour set outside aes(): every point is red.
scatter2 <- ggplot(cpsaat03) +
  geom_point(aes(x = Age, y = CLF_Percent), size = 5, color = "Red")
print(scatter2)

# Same idea with default point size and a different fixed colour.
scatter3 <- ggplot(cpsaat03) +
  geom_point(aes(x = Age, y = CLF_Percent), color = "Purple")
print(scatter3)

Scatter Plot of Civilian Labor Force (Percent), Age, and Civilian Labor Force (Employed)
# Method 1: show CLF_Percent and CLE on the same plot with two y axes.
# A ggplot2 secondary axis is a transformation of the primary one, so CLE is
# divided by `coeff` when plotted (red points) and the secondary axis
# multiplies the primary scale by `coeff` to label it in CLE units.
#
# coeff = maximum of CLE / maximum of CLF_Percent. It is computed from the
# data (instead of a hand-entered constant) so both point clouds share the
# same vertical extent whatever dataset is loaded; NAs are ignored.
coeff <- max(cpsaat03$CLE, na.rm = TRUE) /
  max(cpsaat03$CLF_Percent, na.rm = TRUE)
scatter4 <- ggplot(cpsaat03) +
  geom_point(aes(Age, CLF_Percent)) +
  geom_point(aes(Age, CLE / coeff), color = "Red") +
  scale_y_continuous(
    name = "CLF_Percent",
    sec.axis = sec_axis(~ . * coeff, name = "CLE")
  )
print(scatter4)

# Method 2: a single y axis (CLF_Percent); CLE is shown through point size.
scatter4.1 <- ggplot(cpsaat03, aes(x = Age, y = CLF_Percent)) +
  geom_point(aes(size = CLE))
print(scatter4.1)

Scatter Plot of Percent of Population Not in Labor Force by Age, with differently colored dots for ages where labor force non-participation is over 50%
# The same plot is produced below in several ways (s1 to s4).
# s1: pass a ready-made vector of colours to geom_point(), outside aes().
# Rows with NLF_Percent above 50 are drawn green, all others black; because
# the colours are not mapped through a scale, no legend is drawn.
scatter5 <- ggplot(cpsaat03, aes(x = Age, y = NLF_Percent)) +
  geom_point(
    color = ifelse(cpsaat03$NLF_Percent > 50, "Green", "Black")
  )
print(scatter5)

# s2: map colour to the logical expression NLF_Percent > 50 and choose the
# colour of each level with a manual scale (this also draws a legend titled
# "Over 50%").
# The palette is keyed by the level names "TRUE" / "FALSE". The literals
# TRUE/FALSE are used rather than T/F, which are ordinary variables that
# can be reassigned and would then produce the wrong names.
scatter5 <- ggplot(cpsaat03) +
  geom_point(aes(x = Age, y = NLF_Percent, color = NLF_Percent > 50)) +
  scale_colour_manual(
    name = "Over 50%",
    values = setNames(c("green", "black"), c(TRUE, FALSE))
  )
print(scatter5)

# s3: store the colour name itself in a new column "Color" (via ifelse) and
# map it with scale_color_identity(), which uses the column values as the
# actual colours. Without scale_color_identity() the strings "green" and
# "black" would be treated as ordinary categories and given default colours
# (try removing it to see).
# `tt` is a working copy of the data; the Color column added here is not
# saved back into `tt`.
tt <- cpsaat03
ggplot(
  mutate(tt, Color = ifelse(NLF_Percent > 50, "green", "black")),
  aes(x = Age, y = NLF_Percent, color = Color)
) +
  geom_point() +
  scale_color_identity()

# s4: another way to create the "Color" column, this time with cut().
# cut() uses right-closed intervals by default, so the breaks below give
#   (-Inf, 50] -> "black"   and   (50, Inf] -> "green",
# i.e. exactly the same "NLF_Percent > 50" rule as the other methods
# (a break at 49.9 would colour values between 49.9 and 50 green).
tt$Color <- cut(
  tt$NLF_Percent,
  breaks = c(-Inf, 50, Inf),
  labels = c("black", "green")
)
ggplot(tt, aes(x = Age, y = NLF_Percent, color = Color)) +
  geom_point() +
  scale_color_identity()
