Question 1
require (RCurl)
## Loading required package: RCurl
## Loading required package: bitops
# Download the raw CSV text and parse it into a data frame.
# stringsAsFactors = TRUE is spelled out because the summaries below report
# per-level counts for union/ethn/industry etc., which summary() only does
# for factor columns; since R 4.0.0 read.csv() no longer converts strings
# to factors by default.
income <- read.csv(
  text = getURL(
    "https://raw.githubusercontent.com/TheFedExpress/Data/master/Males%20Income.csv"
  ),
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)
head(income)
## X nr year school exper union ethn married health wage
## 1 1 13 1980 14 1 no other no no 1.197540
## 2 2 13 1981 14 2 yes other no no 1.853060
## 3 3 13 1982 14 3 no other no no 1.344462
## 4 4 13 1983 14 4 no other no no 1.433213
## 5 5 13 1984 14 5 no other no no 1.568125
## 6 6 13 1985 14 6 no other no no 1.699891
## industry occupation
## 1 Business_and_Repair_Service Service_Workers
## 2 Personal_Service Service_Workers
## 3 Business_and_Repair_Service Service_Workers
## 4 Business_and_Repair_Service Service_Workers
## 5 Personal_Service Craftsmen, Foremen_and_kindred
## 6 Business_and_Repair_Service Managers, Officials_and_Proprietors
## residence
## 1 north_east
## 2 north_east
## 3 north_east
## 4 north_east
## 5 north_east
## 6 north_east
# Per-column summary of the full income data frame.
summary(income)
## X nr year school
## Min. : 1 Min. : 13 Min. :1980 Min. : 3.00
## 1st Qu.:1091 1st Qu.: 2329 1st Qu.:1982 1st Qu.:11.00
## Median :2180 Median : 4569 Median :1984 Median :12.00
## Mean :2180 Mean : 5262 Mean :1984 Mean :11.77
## 3rd Qu.:3270 3rd Qu.: 8406 3rd Qu.:1985 3rd Qu.:12.00
## Max. :4360 Max. :12548 Max. :1987 Max. :16.00
##
## exper union ethn married health
## Min. : 0.000 no :3296 black: 504 no :2446 no :4286
## 1st Qu.: 4.000 yes:1064 hisp : 680 yes:1914 yes: 74
## Median : 6.000 other:3176
## Mean : 6.515
## 3rd Qu.: 9.000
## Max. :18.000
##
## wage industry
## Min. :-3.579 Manufacturing :1231
## 1st Qu.: 1.351 Trade :1169
## Median : 1.671 Professional_and_Related Service: 333
## Mean : 1.649 Business_and_Repair_Service : 331
## 3rd Qu.: 1.991 Construction : 327
## Max. : 4.052 Transportation : 286
## (Other) : 683
## occupation residence
## Craftsmen, Foremen_and_kindred :934 north_east : 733
## Operatives_and_kindred :881 nothern_central: 964
## Service_Workers :509 rural_area : 85
## Clerical_and_kindred :486 south :1333
## Professional, Technical_and_kindred:453 NA's :1245
## Laborers_and_farmers :401
## (Other) :696
# Mean and median of wage and experience over the full dataset; the four
# values are reused later when comparing against the subset.
mean_wage <- mean(income[["wage"]])
mean_exp <- mean(income[["exper"]])
med_wage <- median(income[["wage"]])
med_exp <- median(income[["exper"]])
mean_wage
## [1] 1.649147
mean_exp
## [1] 6.514679
med_wage
## [1] 1.671143
med_exp
## [1] 6
Question 2
# Keep only observations after 1982, and only the first six columns plus
# wage (selected by name rather than by position).
income_subset <- income[
  income$year > 1982,
  c("X", "nr", "year", "school", "exper", "union", "wage")
]
head(income_subset)
## X nr year school exper union wage
## 4 4 13 1983 14 4 no 1.4332133
## 5 5 13 1984 14 5 no 1.5681251
## 6 6 13 1985 14 6 no 1.6998909
## 7 7 13 1986 14 7 no -0.7202626
## 8 8 13 1987 14 8 no 1.6691879
## 12 12 17 1983 13 7 no 1.7254101
# Count rows that violate the year filter. The comma after the condition is
# required: without it the logical vector indexes columns instead of rows,
# and the result is empty whether or not the filter worked.
nrow(income_subset[income_subset$year <= 1982, ])
## [1] 0
Looks like the subsetting worked
# Check that row/column subsetting still returned a data frame.
class(income_subset)
## [1] "data.frame"
And the class is correct.
Question 3
# Give the subset's seven columns descriptive names (same order as selected).
names(income_subset) <- c(
  "rownum", "id", "year_of_work", "education_yrs",
  "yrs_experience", "is_union", "hrly_wage"
)
head(income_subset)
## rownum id year_of_work education_yrs yrs_experience is_union hrly_wage
## 4 4 13 1983 14 4 no 1.4332133
## 5 5 13 1984 14 5 no 1.5681251
## 6 6 13 1985 14 6 no 1.6998909
## 7 7 13 1986 14 7 no -0.7202626
## 8 8 13 1987 14 8 no 1.6691879
## 12 12 17 1983 13 7 no 1.7254101
Question 4
# Per-column summary of the renamed subset.
summary(income_subset)
## rownum id year_of_work education_yrs
## Min. : 4 Min. : 13 Min. :1983 Min. : 3.00
## 1st Qu.:1093 1st Qu.: 2329 1st Qu.:1984 1st Qu.:11.00
## Median :2182 Median : 4569 Median :1985 Median :12.00
## Mean :2182 Mean : 5262 Mean :1985 Mean :11.77
## 3rd Qu.:3271 3rd Qu.: 8406 3rd Qu.:1986 3rd Qu.:12.00
## Max. :4360 Max. :12548 Max. :1987 Max. :16.00
## yrs_experience is_union hrly_wage
## Min. : 3.000 no :2074 Min. :-3.579
## 1st Qu.: 6.000 yes: 651 1st Qu.: 1.442
## Median : 8.000 Median : 1.766
## Mean : 8.015 Mean : 1.743
## 3rd Qu.: 9.000 3rd Qu.: 2.069
## Max. :18.000 Max. : 4.052
# Mean and median of wage and experience over the subset, mirroring the
# whole-dataset statistics so the two can be compared side by side.
mean_wage_s <- mean(income_subset[["hrly_wage"]])
mean_exp_s <- mean(income_subset[["yrs_experience"]])
med_wage_s <- median(income_subset[["hrly_wage"]])
med_exp_s <- median(income_subset[["yrs_experience"]])
mean_wage_s
## [1] 1.743033
mean_exp_s
## [1] 8.014679
med_wage_s
## [1] 1.765844
med_exp_s
## [1] 8
# Collect the four statistics for each dataset and lay them out as one
# labelled table: one row per statistic, one column per dataset.
sums <- c(mean_wage, mean_exp, med_wage, med_exp)
sums_s <- c(mean_wage_s, mean_exp_s, med_wage_s, med_exp_s)
comparison <- data.frame(
  whole_dataset = sums,
  subset = sums_s,
  row.names = c(
    "Mean Wage", "Mean Experience", "Median Wage", "Median Experience"
  )
)
comparison
## whole_dataset subset
## Mean Wage 1.649147 1.743033
## Mean Experience 6.514679 8.014679
## Median Wage 1.671143 1.765844
## Median Experience 6.000000 8.000000
It appears that the dataset follows the same individuals every year, because the mean of
experience is exactly 1.5 years greater for the subset (8.014679 vs. 6.514679).
Let's check:
# Take the rows for ids below 500 and count how many times each id occurs.
small <- income[income[["nr"]] < 500, ]
table(small[["nr"]])
##
## 13 17 18 45 110 120 126 150 162 166 189 193 209 212 218 243 259 260
## 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
## 309 351 353 383 408 424 464 483
## 8 8 8 8 8 8 8 8
Looks like that is the case: every id below 500 appears exactly 8 times, once for each year from 1980 to 1987.
Question 5
#' Classify years of experience into tenure bands
#'
#' Vectorised: accepts a single value or a whole column.
#'
#' @param col Numeric vector of years of experience.
#' @return Character vector the same length as `col` with one of
#'   "Under 2" (col < 2), "2 to 4" (2 <= col < 5), "5 to 9" (5 <= col < 10)
#'   or "10+" (col >= 10). `NA` input yields `NA`; empty input yields
#'   `character(0)`.
tenure_class <- function(col) {
  labels <- c("Under 2", "2 to 4", "5 to 9", "10+")
  # findInterval() gives 0 below the first break and k at or above the k-th
  # break (intervals are closed on the left), so adding 1 turns it into an
  # index into `labels`.
  labels[findInterval(col, c(2, 5, 10)) + 1L]
}
# Recode experience into tenure bands. vapply() with character(1) returns a
# plain character vector; lapply() would return a list and turn the column
# into a list column, which breaks table(), factor() and similar calls.
income_subset$yrs_experience <- vapply(
  income_subset$yrs_experience, tenure_class, character(1)
)
Question 6
# Show the first 20 rows to inspect the recoded yrs_experience column.
head(income_subset,20)
## rownum id year_of_work education_yrs yrs_experience is_union hrly_wage
## 4 4 13 1983 14 2 to 4 no 1.4332133
## 5 5 13 1984 14 5 to 9 no 1.5681251
## 6 6 13 1985 14 5 to 9 no 1.6998909
## 7 7 13 1986 14 5 to 9 no -0.7202626
## 8 8 13 1987 14 5 to 9 no 1.6691879
## 12 12 17 1983 13 5 to 9 no 1.7254101
## 13 13 17 1984 13 5 to 9 no 1.6220223
## 14 14 17 1985 13 5 to 9 no 1.6085883
## 15 15 17 1986 13 10+ no 1.5723854
## 16 16 17 1987 13 10+ no 1.8203339
## 20 20 18 1983 12 5 to 9 no 1.9982288
## 21 21 18 1984 12 5 to 9 no 2.1840142
## 22 22 18 1985 12 5 to 9 no 2.2666622
## 23 23 18 1986 12 10+ no 2.0699439
## 24 24 18 1987 12 10+ no 2.8731607
## 28 28 45 1983 12 5 to 9 no 1.7409143
## 29 29 45 1984 12 5 to 9 no 1.8232137
## 30 30 45 1985 12 5 to 9 no 1.9082729
## 31 31 45 1986 12 5 to 9 no 1.7424474
## 32 32 45 1987 12 5 to 9 no 2.1356889