Homework #3: Subsetting Variables and Basic Group Comparison

ANSWER KEY

# Fetch the course data file from the class website and restore the objects
# saved in it. The exercises below expect it to provide a data frame `gss`.
load(file = url("http://www.soc.duke.edu/~dee4/soc333data/gssHW2.data"))

Exercise 1: Write expressions that calculate the following:

# Multiply each of these numbers by three: 100, 2, 33, 4, 10.
# Arithmetic on a vector is applied to every element.
3 * c(100, 2, 33, 4, 10)

## [1] 300   6  99  12  30

# Create a sequence of 20 numbers that starts at zero and counts up by 1.
seq(from = 0, to = 19, by = 1)  # OR, with the colon shorthand:

##  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19

0:19

##  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19

# Add together these vectors: 8, 10, 22, 44 and 9, 10, 54, 34.
# Vectors of equal length are added element by element.
c(8, 10, 22, 44) + c(9, 10, 54, 34)

## [1] 17 20 76 78

Exercise 2: Create a data frame of just the people who report fewer than 300 male or female sex partners. Do the following with this group:

# OK, you could have interpreted what I was asking for a little
# differently. What I wanted was to drop anyone who reported 300 or more
# partners of either sex, i.e. keep respondents below 300 on both counts.
# Note: subset() also drops rows where the condition evaluates to NA.
gss1 <- subset(gss, nummen < 300 & numwomen < 300)
# Create a table of both nummen and numwomen by sex (nummen first)
table(gss1$nummen, gss1$sex)

##      
##       male female
##   0   4473    401
##   1     62   1750
##   2     36    789
##   3     29    677
##   4     22    457
##   5     11    459
##   6     13    262
##   7      5    142
##   8      2    126
##   9      5     43
##   10    20    319
##   11     1     19
##   12     6     67
##   13     0     12
##   14     0     14
##   15     6    107
##   16     0     13
##   17     0      6
##   18     4     14
##   19     0      1
##   20    23     99
##   21     1      6
##   22     3      6
##   23     0      4
##   24     0      4
##   25     6     38
##   26     0      4
##   27     0      3
##   28     0      1
##   30     8     37
##   32     1      0
##   34     0      1
##   35     2     10
##   36     0      1
##   37     0      2
##   40     3     10
##   41     1      0
##   42     0      1
##   45     2      5
##   48     0      1
##   50    13     19
##   52     0      1
##   54     0      1
##   60     0      4
##   65     0      1
##   70     1      2
##   75     1      1
##   80     1      1
##   90     1      0
##   99     0      1
##   100   10      9
##   110    1      2
##   122    0      2
##   150    1      1
##   200    3      1

# Same cross-tabulation for the number of female partners: rows are values of
# numwomen, columns are respondent sex.
table(gss1$numwomen, gss1$sex)

##      
##       male female
##   0    338   5602
##   1    724    145
##   2    358     75
##   3    371     46
##   4    307     17
##   5    342     19
##   6    248     10
##   7    128     11
##   8    137      2
##   9     46      0
##   10   381      6
##   11    17      2
##   12   132      2
##   13    20      1
##   14    21      0
##   15   164      5
##   16    20      0
##   17    11      1
##   18    30      1
##   19     5      0
##   20   276      2
##   21    13      0
##   22    12      1
##   23     7      0
##   24    12      0
##   25   107      1
##   27     3      0
##   28     6      0
##   29     2      0
##   30   114      4
##   31     2      0
##   32     7      1
##   33     4      0
##   34     2      0
##   35    33      0
##   36     3      0
##   37     1      0
##   39     1      0
##   40    52      0
##   42     1      0
##   45    10      0
##   48     1      0
##   49     2      0
##   50   104      0
##   51     2      0
##   52     2      0
##   53     1      0
##   54     1      0
##   55     1      0
##   56     1      0
##   58     1      0
##   59     1      0
##   60    18      0
##   62     1      0
##   63     1      0
##   65     4      0
##   70     7      0
##   73     1      0
##   74     1      0
##   75    11      0
##   77     1      0
##   80     7      1
##   85     1      0
##   90     3      0
##   100   87      2
##   101    3      0
##   103    1      0
##   120    3      0
##   121    1      0
##   137    1      0
##   138    1      0
##   147    1      0
##   150   12      0
##   167    1      0
##   170    1      0
##   175    2      0
##   200   16      0
##   201    1      0
##   240    1      0
##   250    4      0

# How many women report at least 1 female sex partner? Use a logical
# expression and sum(): summing a logical vector counts its TRUE values.
with(gss1, sum(numwomen[sex == "female"] >= 1))

## [1] 355

# could also use: keep only the matching values and count them
with(gss1, length(numwomen[sex == "female" & numwomen >= 1]))

## [1] 355

# How many men report at least 1 female sex partner?
with(gss1, sum(numwomen[sex == "male"] >= 1))

## [1] 4439

## [1] 4439

# Calculate the mean, the range, and the standard deviation of the number of
# same-sex partners for men and for women who report at least 1 same-sex
# partner.
# Female same sex: women with one or more female partners
gssff <- subset(gss1, sex == "female" & numwomen >= 1)
# Male same sex: men with one or more male partners
gssmm <- subset(gss1, sex == "male" & nummen >= 1)
# Calc statistics: women first, then men
mean(gssff$numwomen)

## [1] 4.217

range(gssff$numwomen)  # gives the min and max

## [1]   1 100

diff(range(gssff$numwomen))  # max minus min, as a single number

## [1] 99

sd(gssff$numwomen)

## [1] 9.551

mean(gssmm$nummen)

## [1] 16.77

range(gssmm$nummen)  # gives the min and max

## [1]   1 200

diff(range(gssmm$nummen))  # max minus min, as a single number

## [1] 199

sd(gssmm$nummen)

## [1] 29.89

# Describe what is different between men and women in terms of the number of
# same-sex partners they have had.
# Answer: Males have more partners on average (mean 16.77 vs. 4.217), and the
# distribution for men is much more heavily skewed.

Exercise 3: Create two histograms: 1) the number of female partners for men and 2) the number of male partners for women. What patterns do you see for each group separately? What are the key differences between the groups?

# Split the full sample (gss, not the trimmed gss1) by respondent sex.
gssm <- subset(gss, sex == "male")
gssf <- subset(gss, sex == "female")
# Men: distribution of the number of female partners. A single number for
# `breaks` asks hist() for about that many bins (it is only a suggestion).
hist(gssm$numwomen, breaks = 100)

plot of chunk unnamed-chunk-4

# Women: distribution of the number of male partners, again requesting about
# 100 bins.
hist(gssf$nummen, breaks = 100)

plot of chunk unnamed-chunk-4

# Men report more female partners, on average, than women report male
# partners; the maximum number reported by men is also larger.

Exercise 4: With the men, create a new variable that calculates the number of opposite sex partners divided by the number of years since the respondent's 18th birthday. Make a histogram that displays this statistic. Experiment with “breaks=” to make it look good. In your script file, keep the histogram with the best “breaks”.

# Female partners per year since the 18th birthday, for each man. With
# age - 17 an 18-year-old gets a denominator of 1; depending on when you
# assume someone turned 18, you might also do age - 18 (which gives 0 there).
newvar <- with(gssm, numwomen / (age - 17))
hist(newvar, breaks = 100)

plot of chunk unnamed-chunk-5

Exercise 5: The GSS asks if the respondent has had an extramarital affair. This is the variable evstray. What proportion of married men have “strayed”, and what proportion of married women? Show me the R commands to produce these proportions.

# Men: tabulate evstray, then take "yes" as a share of everyone who answered
# yes or no (the "never married" level is left out of the denominator).
table(gssm$evstray)

## 
##           yes            no never married 
##           829          2599          1374

829 / (829 + 2599)

## [1] 0.2418

# OR, computed from the data instead of retyping the counts
sum(gssm$evstray == "yes") / sum(gssm$evstray %in% c("yes", "no"))

## [1] 0.2418

# Females: cross-tabulate evstray (rows) by marital status (columns)
table(gssf$evstray, gssf$marital)

##                
##                 married widowed divorced separated never married
##   yes               293      72      241        68             0
##   no               2519     543      690       168             0
##   never married       0       0        0         0          1364

# Row totals of the table above:
#   yes: 293 + 72 + 241 + 68 = 674    no: 2519 + 543 + 690 + 168 = 3920
674 / (674 + 3920)

## [1] 0.1467

# Same proportion computed from the data instead of retyping the counts
sum(gssf$evstray == "yes") / sum(gssf$evstray %in% c("yes", "no"))

## [1] 0.1467

Exercise 6: Create a subset of the data restricted to only the “strayers.” If you were to predict the religious affiliation of “strayers” what would you guess? Now, figure out the proportion of “strayers” by religious tradition (in the variable reltrad). Make a barplot of the proportions to visualize these data. What is this plot telling us? What conclusions can we reach? What might lie behind this result?

# Keep only the respondents who answered "yes" to evstray.
gss2 <- subset(gss, evstray == "yes")
# I would predict that they would be non-affiliated
# Bars show the share of all strayers in each religious tradition;
# las = 2 draws the axis labels perpendicular to the axis.
barplot(
  prop.table(table(gss2$reltrad)),
  las = 2
)

plot of chunk unnamed-chunk-7

# The barplot alone can't tell us what is going on, because we also need to
# know how many married people there are in each religion. If we subset the
# data to just the married people and then look at the proportion of
# strayers within each religion, we can tell what's going on.
gss3 <- subset(gss, marital == "married")
# margin = 1 converts counts to proportions within each row (each religion)
prop.table(table(gss3$reltrad, gss3$evstray), margin = 1)

##                   
##                       yes     no never married
##   evangelical      0.1308 0.8692        0.0000
##   mainline         0.1340 0.8660        0.0000
##   black protestant 0.1870 0.8130        0.0000
##   catholic         0.1253 0.8747        0.0000
##   jewish           0.1262 0.8738        0.0000
##   other faith      0.1581 0.8419        0.0000
##   nonaffiliated    0.1743 0.8257        0.0000