# Load the GSS homework data from the course website; the exercises below
# expect this to provide the `gss` data frame.
# load() does not dispose of a connection object handed to it, so keep a
# handle on the url() connection and close it explicitly once loading is done.
gss_con <- url("http://www.soc.duke.edu/~dee4/soc333data/gssHW2.data")
load(gss_con)
close(gss_con)
Exercise 1: Write expressions that calculate the following:
# Multiply each of these numbers by three: 100, 2, 33, 4, 10
3 * c(100, 2, 33, 4, 10)
## [1] 300 6 99 12 30
# Build a sequence of 20 numbers that starts at zero and goes up in steps
# of 1
seq(from = 0, to = 19, by = 1) # OR
## [1] 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
seq_len(20L) - 1L
## [1] 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
# Element-wise sum of the vectors 8,10,22,44 and 9,10,54,34
c(9, 10, 54, 34) + c(8, 10, 22, 44)
## [1] 17 20 76 78
Exercise 2: Create a data frame of just the people who report less than 300 male or female sex partners. Do the following with this group:
# The prompt can be read in more than one way. The reading used here: keep
# only respondents who reported fewer than 300 male partners AND fewer than
# 300 female partners. subset() also drops rows where the condition is NA.
gss1 <- subset(gss, nummen < 300 & numwomen < 300)
# Cross-tabulate each of nummen and numwomen against sex (nummen first)
table(gss1[["nummen"]], gss1[["sex"]])
##
## male female
## 0 4473 401
## 1 62 1750
## 2 36 789
## 3 29 677
## 4 22 457
## 5 11 459
## 6 13 262
## 7 5 142
## 8 2 126
## 9 5 43
## 10 20 319
## 11 1 19
## 12 6 67
## 13 0 12
## 14 0 14
## 15 6 107
## 16 0 13
## 17 0 6
## 18 4 14
## 19 0 1
## 20 23 99
## 21 1 6
## 22 3 6
## 23 0 4
## 24 0 4
## 25 6 38
## 26 0 4
## 27 0 3
## 28 0 1
## 30 8 37
## 32 1 0
## 34 0 1
## 35 2 10
## 36 0 1
## 37 0 2
## 40 3 10
## 41 1 0
## 42 0 1
## 45 2 5
## 48 0 1
## 50 13 19
## 52 0 1
## 54 0 1
## 60 0 4
## 65 0 1
## 70 1 2
## 75 1 1
## 80 1 1
## 90 1 0
## 99 0 1
## 100 10 9
## 110 1 2
## 122 0 2
## 150 1 1
## 200 3 1
# Number of female partners (rows) by respondent sex (columns)
table(gss1[["numwomen"]], gss1[["sex"]])
##
## male female
## 0 338 5602
## 1 724 145
## 2 358 75
## 3 371 46
## 4 307 17
## 5 342 19
## 6 248 10
## 7 128 11
## 8 137 2
## 9 46 0
## 10 381 6
## 11 17 2
## 12 132 2
## 13 20 1
## 14 21 0
## 15 164 5
## 16 20 0
## 17 11 1
## 18 30 1
## 19 5 0
## 20 276 2
## 21 13 0
## 22 12 1
## 23 7 0
## 24 12 0
## 25 107 1
## 27 3 0
## 28 6 0
## 29 2 0
## 30 114 4
## 31 2 0
## 32 7 1
## 33 4 0
## 34 2 0
## 35 33 0
## 36 3 0
## 37 1 0
## 39 1 0
## 40 52 0
## 42 1 0
## 45 10 0
## 48 1 0
## 49 2 0
## 50 104 0
## 51 2 0
## 52 2 0
## 53 1 0
## 54 1 0
## 55 1 0
## 56 1 0
## 58 1 0
## 59 1 0
## 60 18 0
## 62 1 0
## 63 1 0
## 65 4 0
## 70 7 0
## 73 1 0
## 74 1 0
## 75 11 0
## 77 1 0
## 80 7 1
## 85 1 0
## 90 3 0
## 100 87 2
## 101 3 0
## 103 1 0
## 120 3 0
## 121 1 0
## 137 1 0
## 138 1 0
## 147 1 0
## 150 12 0
## 167 1 0
## 170 1 0
## 175 2 0
## 200 16 0
## 201 1 0
## 240 1 0
## 250 4 0
# How many women report at least 1 female sex partner? Summing a logical
# vector counts its TRUE values.
with(gss1, sum(numwomen[sex == "female"] >= 1))
## [1] 355
# Alternative: count the elements that satisfy both conditions
with(gss1, length(numwomen[sex == "female" & numwomen >= 1]))
## [1] 355
# Same count for the men
with(gss1, sum(numwomen[sex == "male"] >= 1))
## [1] 4439
# Mean, range, and standard deviation of the number of same-sex partners,
# separately for women and men who report at least 1 same-sex partner.
# Women with at least one female partner
gssff <- subset(gss1, sex == "female" & numwomen >= 1)
## Men with at least one male partner
gssmm <- subset(gss1, sex == "male" & nummen >= 1)
# Summary statistics, women first and then men
mean(gssff[["numwomen"]])
## [1] 4.217
range(gssff[["numwomen"]]) # smallest and largest value
## [1] 1 100
diff(range(gssff[["numwomen"]])) # width of the range (max - min)
## [1] 99
sd(gssff[["numwomen"]])
## [1] 9.551
mean(gssmm[["nummen"]])
## [1] 16.77
range(gssmm[["nummen"]]) # smallest and largest value
## [1] 1 200
diff(range(gssmm[["nummen"]])) # width of the range (max - min)
## [1] 199
sd(gssmm[["nummen"]])
## [1] 29.89
# How do men and women differ in their number of same-sex partners?
# Men report more same-sex partners, and the distribution for men is much
# more heavily skewed.
Exercise 3: Create two histograms: 1) the number of female partners for men and 2) the number of male partners for women. What patterns do you see for each group separately? What are the key differences between the groups?
# Split the full sample by respondent sex
gssm <- subset(gss, sex == "male")
gssf <- subset(gss, sex == "female")
# Opposite-sex partner counts: female partners of men, then male partners of
# women. The hist() arguments are written with `$` because the default plot
# title and x-axis label are taken from the argument expression.
hist(gssm$numwomen, breaks = 100)
hist(gssf$nummen, breaks = 100)
# On average, men report more female partners than women report male
# partners; the maximum number reported by men is also larger.
Exercise 4: With the men, create a new variable that calculates the number of opposite sex partners divided by the number of years since the respondent's 18th birthday. Make a histogram that displays this statistic. Experiment with “breaks=” to make it look good. In your script file, keep the histogram with the best “breaks”.
# Female partners per year since the 18th birthday. Subtracting 17 gives an
# 18-year-old one year of exposure; depending on when you assume someone
# turned 18, age - 18 would also be defensible.
newvar <- with(gssm, numwomen / (age - 17))
hist(newvar, breaks = 100)
Exercise 5: The GSS asks if the respondent has had an extramarital affair. This is the variable evstray. What proportion of married men have “strayed”? What proportion of women? Show me the R commands to produce these proportions.
# Males: tabulate evstray, then divide the "yes" count by the combined "yes"
# and "no" counts (the "never married" category is left out of the base)
table(gssm[["evstray"]])
##
## yes no never married
## 829 2599 1374
829 / (2599 + 829)
## [1] 0.2418
# OR compute the same proportion directly from the data
with(gssm, sum(evstray == "yes") / sum(evstray == "yes" | evstray == "no"))
## [1] 0.2418
# Females: 674 and 3920 are the "yes" and "no" row totals of this table
table(gssf[["evstray"]], gssf[["marital"]])
##
## married widowed divorced separated never married
## yes 293 72 241 68 0
## no 2519 543 690 168 0
## never married 0 0 0 0 1364
674 / (3920 + 674)
## [1] 0.1467
with(gssf, sum(evstray == "yes") / sum(evstray == "yes" | evstray == "no"))
## [1] 0.1467
Exercise 6: Create a subset of the data restricted to only the “strayers.” If you were to predict the religious affiliation of “strayers” what would you guess? Now, figure out the proportion of “strayers” by religious tradition (in the variable reltrad). Make a barplot of the proportions to visualize these data. What is this plot telling us? What conclusions can we reach? What might lie behind this result?
# Restrict the data to respondents who answered "yes" to evstray
gss2 <- subset(gss, evstray == "yes")
# Prediction: strayers would mostly be non-affiliated
barplot(prop.table(table(gss2[["reltrad"]])), las = 2)
# This plot alone cannot settle the question: it shows how the strayers are
# spread across religions, which also depends on how many married people
# each religion has. Restricting the data to married respondents and taking
# the proportion of strayers within each religion (row proportions) shows
# what is going on.
gss3 <- subset(gss, marital == "married")
prop.table(table(gss3[["reltrad"]], gss3[["evstray"]]), margin = 1)
##
## yes no never married
## evangelical 0.1308 0.8692 0.0000
## mainline 0.1340 0.8660 0.0000
## black protestant 0.1870 0.8130 0.0000
## catholic 0.1253 0.8747 0.0000
## jewish 0.1262 0.8738 0.0000
## other faith 0.1581 0.8419 0.0000
## nonaffiliated 0.1743 0.8257 0.0000