library(dplyr)
§ 1.1 How many interviewees are in the dataset?
CPS = read.csv("data/CPSData.csv")
nrow(CPS)
[1] 131302
§ 1.2 Among the interviewees with a value reported for the Industry variable, what is the most common industry of employment? Please enter the name exactly how you see it.
sort(table(CPS$Industry)) %>% tail # Educational and health services
Construction Leisure and hospitality
4387 6364
Manufacturing Professional and business services
6791 7519
Trade Educational and health services
8933 15017
§ 1.3 Which state has the fewest interviewees?
head(sort(table(CPS$State))) # New Mexico
New Mexico Montana Mississippi Alabama West Virginia
1102 1214 1230 1376 1409
Arkansas
1421
Which state has the largest number of interviewees?
tail(sort(table(CPS$State))) # California
Illinois Pennsylvania Florida New York Texas California
3912 3930 5149 5595 7077 11570
§ 1.4 What proportion of interviewees are citizens of the United States?
table(CPS$Citizenship) %>% prop.table
Citizen, Native Citizen, Naturalized Non-Citizen
0.888326 0.053868 0.057806
table(CPS$Citizenship == "Non-Citizen") %>% prop.table # 0.942194
FALSE TRUE
0.942194 0.057806
§ 1.5 For which races are there at least 250 interviewees in the CPS dataset of Hispanic ethnicity? (Select all that apply.)
table(CPS$Race, CPS$Hispanic)
0 1
American Indian 1129 304
Asian 6407 113
Black 13292 621
Multiracial 2449 448
Pacific Islander 541 77
White 89190 16731
§ 2.1 Which variables have at least one interviewee with a missing (NA) value? (Select all that apply.)
colSums(is.na(CPS))
PeopleInHousehold Region State MetroAreaCode
0 0 0 34238
Age Married Sex Education
0 25338 0 25338
Race Hispanic CountryOfBirthCode Citizenship
0 0 0 0
EmploymentStatus Industry
25789 65060
§ 2.2 Which is the most accurate:
lapply(CPS[c('Region','Sex','Age','Citizenship')],
function(x) table(is.na(CPS$Married), x))
$`Region`
x
Midwest Northeast South West
FALSE 24609 21432 33535 26388
TRUE 6075 4507 7967 6789
$Sex
x
Female Male
FALSE 55264 50700
TRUE 12217 13121
$Age
x
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
FALSE 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
TRUE 1283 1559 1574 1693 1695 1795 1721 1681 1729 1748 1750 1721 1797 1802 1790
x
15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
FALSE 1795 1751 1764 1596 1517 1398 1525 1536 1638 1627 1604 1643 1657 1736 1645
TRUE 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
x
30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
FALSE 1854 1762 1790 1804 1653 1716 1663 1531 1530 1542 1571 1673 1711 1819 1764
TRUE 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
x
45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
FALSE 1749 1665 1647 1791 1989 1966 1931 1935 1994 1912 1895 1935 1827 1874 1758
TRUE 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
x
60 61 62 63 64 65 66 67 68 69 70 71 72 73 74
FALSE 1746 1735 1595 1596 1519 1569 1577 1227 1130 1062 1195 1031 941 896 842
TRUE 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
x
75 76 77 78 79 80 85
FALSE 763 729 698 659 661 2664 2446
TRUE 0 0 0 0 0 0 0
$Citizenship
x
Citizen, Native Citizen, Naturalized Non-Citizen
FALSE 91956 6910 7098
TRUE 24683 163 492
§ 2.3 How many states had all interviewees living in a non-metropolitan area (aka they have a missing MetroAreaCode value)? For this question, treat the District of Columbia as a state (even though it is not technically a state).
table(is.na(CPS$MetroAreaCode), CPS$State) # 2
Alabama Alaska Arizona Arkansas California Colorado Connecticut Delaware
FALSE 1020 0 1327 724 11333 2545 2593 1696
TRUE 356 1590 201 697 237 380 243 518
District of Columbia Florida Georgia Hawaii Idaho Illinois Indiana Iowa
FALSE 1791 4947 2250 1576 761 3473 1420 1297
TRUE 0 202 557 523 757 439 584 1231
Kansas Kentucky Louisiana Maine Maryland Massachusetts Michigan Minnesota
FALSE 1234 908 1216 909 2978 1858 2517 2150
TRUE 701 933 234 1354 222 129 546 989
Mississippi Missouri Montana Nebraska Nevada New Hampshire New Jersey
FALSE 376 1440 199 816 1609 1148 2567
TRUE 854 705 1015 1133 247 1514 0
New Mexico New York North Carolina North Dakota Ohio Oklahoma Oregon
FALSE 832 5144 1642 432 2754 1024 1519
TRUE 270 451 977 1213 924 499 424
Pennsylvania Rhode Island South Carolina South Dakota Tennessee Texas
FALSE 3245 2209 1139 595 1149 6060
TRUE 685 0 519 1405 635 1017
Utah Vermont Virginia Washington West Virginia Wisconsin Wyoming
FALSE 1455 657 2367 1937 344 1882 0
TRUE 387 1233 586 429 1065 804 1624
How many states had all interviewees living in a metropolitan area? Again, treat the District of Columbia as a state.
# 3
§ 2.4 Which region of the United States has the largest proportion of interviewees living in a non-metropolitan area?
tapply(is.na(CPS$MetroAreaCode), CPS$Region, mean) %>% sort
Northeast South West Midwest
0.21624 0.23784 0.24366 0.34787
§ 2.5 Which state has a proportion of interviewees living in a non-metropolitan area closest to 30%?
abs(0.3 - tapply(is.na(CPS$MetroAreaCode), CPS$State, mean)) %>%
sort %>% head # Wisconsin
Wisconsin Indiana South Carolina Minnesota Oklahoma
0.00067014 0.00858283 0.01302774 0.01506849 0.02764281
Missouri
0.02867133
Which state has the largest proportion of non-metropolitan interviewees, ignoring states where all interviewees were non-metropolitan?
tapply(is.na(CPS$MetroAreaCode), CPS$State, mean) %>%
sort %>% tail # Montana
South Dakota North Dakota West Virginia Montana Alaska
0.70250 0.73739 0.75586 0.83608 1.00000
Wyoming
1.00000
§ 3.1 How many observations (codes for metropolitan areas) are there in MetroAreaMap?
Metro = read.csv("data/MetroAreaCodes.csv")
nrow(Metro)
[1] 271
How many observations (codes for countries) are there in CountryMap?
Country = read.csv("data/CountryCodes.csv")
nrow(Country)
[1] 149
§ 3.2 What is the name of the variable that was added to the data frame by the merge() operation?
CPS = merge(CPS, Metro, by.x="MetroAreaCode", by.y="Code", all.x=TRUE)
# MetroArea
How many interviewees have a missing value for the new metropolitan area variable?
sum(is.na(CPS$MetroArea))
[1] 34238
§ 3.3 Which of the following metropolitan areas has the largest number of interviewees?
table(CPS$MetroArea) %>% sort %>% tail(10) # Boston-Cambridge-Quincy
Houston-Baytown-Sugar Land, TX
1649
Dallas-Fort Worth-Arlington, TX
1863
Minneapolis-St Paul-Bloomington, MN-WI
1942
Boston-Cambridge-Quincy, MA-NH
2229
Providence-Fall River-Warwick, MA-RI
2284
Chicago-Naperville-Joliet, IN-IN-WI
2772
Philadelphia-Camden-Wilmington, PA-NJ-DE
2855
Los Angeles-Long Beach-Santa Ana, CA
4102
Washington-Arlington-Alexandria, DC-VA-MD-WV
4177
New York-Northern New Jersey-Long Island, NY-NJ-PA
5409
§ 3.4 Which metropolitan area has the highest proportion of interviewees of Hispanic ethnicity?
tapply(CPS$Hispanic, CPS$MetroArea, mean) %>% sort %>% tail # Laredo, TX
San Antonio, TX El Centro, CA El Paso, TX
0.64415 0.68687 0.79098
Brownsville-Harlingen, TX McAllen-Edinburg-Pharr, TX Laredo, TX
0.79747 0.94872 0.96629
§ 3.5 Determine the number of metropolitan areas in the United States from which at least 20% of interviewees are Asian.
tapply(CPS$Race == "Asian", CPS$MetroArea, mean) %>% sort %>% tail # 4
Warner Robins, GA Fresno, CA
0.16667 0.18482
Vallejo-Fairfield, CA San Jose-Sunnyvale-Santa Clara, CA
0.20301 0.24179
San Francisco-Oakland-Fremont, CA Honolulu, HI
0.24675 0.50190
§ 3.6 Passing na.rm=TRUE to the tapply function, determine which metropolitan area has the smallest proportion of interviewees who have received no high school diploma.
tapply(CPS$Education == "No high school diploma", CPS$MetroArea, mean, na.rm=T) %>%
sort %>% head # Iowa City, IA
Iowa City, IA Bowling Green, KY Kalamazoo-Portage, MI
0.029126 0.037037 0.050505
Champaign-Urbana, IL Bremerton-Silverdale, WA Lawrence, KS
0.051546 0.054054 0.059524
§ 4.1 What is the name of the variable added to the CPS data frame by this merge operation?
CPS = merge(CPS, Country, by.x="CountryOfBirthCode", by.y="Code", all.x=TRUE)
# Country
How many interviewees have a missing value for the new metropolitan area variable?
sum(is.na(CPS$Country))
[1] 176
§ 4.2 Among all interviewees born outside of North America, which country was the most common place of birth?
table(CPS$Country) %>% sort %>% tail # Philippines
Puerto Rico China India Philippines Mexico
518 581 770 839 3921
United States
115063
§ 4.3 What proportion of the interviewees from the “New York-Northern New Jersey-Long Island, NY-NJ-PA” metropolitan area have a country of birth that is not the United States?
area = "New York-Northern New Jersey-Long Island, NY-NJ-PA"
mean(CPS$Country[CPS$MetroArea==area] != "United States", na.rm=T)
[1] 0.30866
§ 4.4 Which metropolitan area has the largest number (note – not proportion) of interviewees with a country of birth in India?
tapply(CPS$Country == 'India', CPS$MetroArea, sum, na.rm=T) %>%
sort %>% tail
San Francisco-Oakland-Fremont, CA
27
Detroit-Warren-Livonia, MI
30
Chicago-Naperville-Joliet, IN-IN-WI
31
Philadelphia-Camden-Wilmington, PA-NJ-DE
32
Washington-Arlington-Alexandria, DC-VA-MD-WV
50
New York-Northern New Jersey-Long Island, NY-NJ-PA
96
# New York-Northern New Jersey-Long Island, NY-NJ-PA
In Brazil?
tapply(CPS$Country == 'Brazil', CPS$MetroArea, sum, na.rm=T) %>% sort %>% tail
Bridgeport-Stamford-Norwalk, CT
7
New York-Northern New Jersey-Long Island, NY-NJ-PA
7
Washington-Arlington-Alexandria, DC-VA-MD-WV
8
Los Angeles-Long Beach-Santa Ana, CA
9
Miami-Fort Lauderdale-Miami Beach, FL
16
Boston-Cambridge-Quincy, MA-NH
18
# Boston-Cambridge-Quincy, MA-NH
In Somalia?
tapply(CPS$Country == 'Somalia', CPS$MetroArea, sum, na.rm=T) %>% sort %>% tail
Columbus, OH Fargo, ND-MN
5 5
Phoenix-Mesa-Scottsdale, AZ Seattle-Tacoma-Bellevue, WA
7 7
St. Cloud, MN Minneapolis-St Paul-Bloomington, MN-WI
7 17
# Minneapolis-St Paul-Bloomington, MN-WI