Hi all! Here is my code for Workshop 3.
Download GSS2012.csv here
Question 1:
# Load the GSS 2012 extract; the relative path means the CSV must sit in the
# current working directory.
gss <- read.csv(file = "GSS2012.csv")
# Inspect the structure: at this point every column holds numeric codes.
str(object = gss)
## 'data.frame': 4820 obs. of 9 variables:
## $ marital: int 5 5 1 1 4 2 1 4 1 5 ...
## $ age : int 22 21 42 49 70 50 35 24 28 28 ...
## $ sex : int 1 1 1 2 2 2 2 2 2 2 ...
## $ race : int 1 1 3 1 2 1 1 3 2 1 ...
## $ hompop : int 3 5 4 3 1 2 4 4 2 1 ...
## $ happy : int 1 1 2 1 1 1 2 2 3 2 ...
## $ health : int 2 1 2 0 0 4 0 3 3 2 ...
## $ satjob : int 1 1 3 0 0 0 1 2 2 2 ...
## $ fincome: num 178712 178712 91920 107240 42130 ...
# 1a.
# Recode every 0 to NA. The comparison runs over the whole data frame, so a 0
# in any column is treated as missing.
gss[gss == 0] <- NA
# Logical matrix flagging the NA cells. is.na() works on a data frame directly,
# so there is no need to push it through apply() (which coerces to a matrix).
boolean <- is.na(gss)
# Number of NA values in each column.
colSums(boolean)
## marital age sex race hompop happy health satjob fincome
## 2 51 0 0 1 14 1672 1266 446
# Overall proportion of NA cells (a fraction between 0 and 1, not a percentage).
mean(boolean) # not required in question
## [1] 0.07957584
# 1b.
# Convert the coded categorical variables to labelled factors. Any value that
# is not listed in `levels` (including the NAs created in 1a) becomes NA.
# from example
gss$sex <- factor(gss$sex, levels = c(1, 2), labels = c("Male", "Female"))
# remaining categorical variables
gss$marital <- factor(
  gss$marital,
  levels = c(1, 2, 3, 4, 5),
  labels = c("Married", "Widowed", "Divorced", "Separated", "Never Married")
)
gss$race <- factor(
  gss$race,
  levels = c(1, 2, 3),
  labels = c("White", "Black", "Other")
)
gss$happy <- factor(
  gss$happy,
  levels = c(1, 2, 3),
  labels = c("Very happy", "Pretty happy", "Not too happy")
)
gss$health <- factor(
  gss$health,
  levels = c(1, 2, 3, 4),
  labels = c("Excellent", "Good", "Fair", "Poor")
)
# NOTE: the first satjob label was originally misspelled "Very satified". It is
# corrected here, so console output pasted elsewhere in this post from earlier
# runs may still show the old spelling.
gss$satjob <- factor(
  gss$satjob,
  levels = c(1, 2, 3, 4),
  labels = c(
    "Very satisfied", "Moderately satisfied",
    "A little dissatisfied", "Very dissatisfied"
  )
)
str(gss)
## 'data.frame': 4820 obs. of 9 variables:
## $ marital: Factor w/ 5 levels "Married","Widowed",..: 5 5 1 1 4 2 1 4 1 5 ...
## $ age : int 22 21 42 49 70 50 35 24 28 28 ...
## $ sex : Factor w/ 2 levels "Male","Female": 1 1 1 2 2 2 2 2 2 2 ...
## $ race : Factor w/ 3 levels "White","Black",..: 1 1 3 1 2 1 1 3 2 1 ...
## $ hompop : int 3 5 4 3 1 2 4 4 2 1 ...
## $ happy : Factor w/ 3 levels "Very happy","Pretty happy",..: 1 1 2 1 1 1 2 2 3 2 ...
## $ health : Factor w/ 4 levels "Excellent","Good",..: 2 1 2 NA NA 4 NA 3 3 2 ...
## $ satjob : Factor w/ 4 levels "Very satisfied",..: 1 1 3 NA NA NA 1 2 2 2 ...
## $ fincome: num 178712 178712 91920 107240 42130 ...
# 1c.
# New column pcincome = fincome / hompop (presumably family income per
# household member; confirm against the GSS codebook).
gss[["pcincome"]] <- with(gss, fincome / hompop)
# Check that the column was appended (the data frame now has 10 variables).
str(object = gss)
## 'data.frame': 4820 obs. of 10 variables:
## $ marital : Factor w/ 5 levels "Married","Widowed",..: 5 5 1 1 4 2 1 4 1 5 ...
## $ age : int 22 21 42 49 70 50 35 24 28 28 ...
## $ sex : Factor w/ 2 levels "Male","Female": 1 1 1 2 2 2 2 2 2 2 ...
## $ race : Factor w/ 3 levels "White","Black",..: 1 1 3 1 2 1 1 3 2 1 ...
## $ hompop : int 3 5 4 3 1 2 4 4 2 1 ...
## $ happy : Factor w/ 3 levels "Very happy","Pretty happy",..: 1 1 2 1 1 1 2 2 3 2 ...
## $ health : Factor w/ 4 levels "Excellent","Good",..: 2 1 2 NA NA 4 NA 3 3 2 ...
## $ satjob : Factor w/ 4 levels "Very satified",..: 1 1 3 NA NA NA 1 2 2 2 ...
## $ fincome : num 178712 178712 91920 107240 42130 ...
## $ pcincome: num 59571 35742 22980 35747 42130 ...
Question 2:
# 2a.
# Per-column overview: level counts for the factors, five-number summary plus
# mean for the numeric columns, and the NA count wherever values are missing.
summary(object = gss)
## marital age sex race
## Married :2255 Min. :18.00 Male :2132 White:3700
## Widowed : 404 1st Qu.:35.00 Female:2688 Black: 722
## Divorced : 798 Median :49.00 Other: 398
## Separated : 159 Mean :49.59
## Never Married:1202 3rd Qu.:62.00
## NA's : 2 Max. :89.00
## NA's :51
## hompop happy health
## Min. : 1.000 Very happy :1391 Excellent: 794
## 1st Qu.: 1.000 Pretty happy :2756 Good :1475
## Median : 2.000 Not too happy: 659 Fair : 694
## Mean : 2.588 NA's : 14 Poor : 185
## 3rd Qu.: 3.000 NA's :1672
## Max. :10.000
## NA's :1
## satjob fincome pcincome
## Very satified :1761 Min. : 383 Min. : 54.71
## Moderately satisfied :1311 1st Qu.: 18193 1st Qu.: 8138.75
## A little dissatisfied: 340 Median : 34470 Median : 16277.50
## Very dissatisfied : 142 Mean : 49894 Mean : 23109.12
## NA's :1266 3rd Qu.: 63195 3rd Qu.: 30640.00
## Max. :178712 Max. :178712.46
## NA's :446 NA's :446
# 2b.
# Histogram of fincome with the default breaks and labels.
hist(x = gss$fincome)

# 2c.
# With a factor on the x axis, plot() draws one box of fincome per level.
plot(
  x = gss$happy, y = gss$fincome,
  main = "fincome vs happy", xlab = "happy", ylab = "fincome"
)

plot(
  x = gss$race, y = gss$fincome,
  main = "fincome vs race", xlab = "race", ylab = "fincome"
)

# 2d.
# Split respondents into two age groups. subset() treats an NA condition as
# FALSE, so rows with a missing age end up in neither group.
# group 1, 30 <= age < 50
sub1gss <- subset(gss, age >= 30 & age < 50)
# group 2, age < 30 or age >= 50
sub2gss <- subset(gss, age < 30 | age >= 50)
# Draw the four plots on a 2 x 2 grid that is filled column by column (mfcol):
# left column = group 1, right column = group 2. par() returns the previous
# setting, which is kept so the layout can be restored exactly afterwards
# instead of being forced to 1 x 1.
old_par <- par(mfcol = c(2, 2))
plot(sub1gss$happy, sub1gss$fincome,
  xlab = "happy", ylab = "fincome",
  main = "fincome vs happy, 30 =< age <50", cex.axis = 0.8
)
plot(sub1gss$race, sub1gss$fincome,
  xlab = "race", ylab = "fincome",
  main = "fincome vs race, 30 =< age <50", cex.axis = 0.8
)
plot(sub2gss$happy, sub2gss$fincome,
  xlab = "happy", ylab = "fincome",
  main = "fincome vs happy, age <30 or age >= 50", cex.axis = 0.8
)
plot(sub2gss$race, sub2gss$fincome,
  xlab = "race", ylab = "fincome",
  main = "fincome vs race, age <30 or age >= 50", cex.axis = 0.8
)

# Put the plotting layout back to whatever it was before this section.
par(old_par)
# 2e. (the original header repeated "2d."; renumbered to sit between 2d and 2f)
library(ggplot2)
# Keep only the rows that have no NA in any column.
# NOTE(review): na.omit() is row-wise over all columns, so rows missing only
# health or satjob (1672 and 1266 NAs in the 1a counts) are dropped from this
# chart as well. Confirm that is what the exercise intends.
gssNoNA <- na.omit(object = gss)
# Base plot object with age on the x axis; the fill variable is added per chart.
g0 <- ggplot(data = gssNoNA, mapping = aes(x = age))
# Count of respondents at each age, bars split by the happy factor.
g0 + geom_bar(mapping = aes(fill = happy))

# 2f.
# Same age bar chart built on g0, with bars split by the health factor.
g0 + geom_bar(mapping = aes(fill = health))

# 2g.
# Same age bar chart built on g0, with bars split by the satjob factor.
g0 + geom_bar(mapping = aes(fill = satjob))
