Question 1

require (RCurl)
## Loading required package: RCurl
## Loading required package: bitops
# Download the raw CSV as text and parse it into a data frame.
income <- read.csv(
  text = getURL(
    "https://raw.githubusercontent.com/TheFedExpress/Data/master/Males%20Income.csv"
  ),
  header = TRUE,
  sep = ","
)
head(income)
##   X nr year school exper union  ethn married health     wage
## 1 1 13 1980     14     1    no other      no     no 1.197540
## 2 2 13 1981     14     2   yes other      no     no 1.853060
## 3 3 13 1982     14     3    no other      no     no 1.344462
## 4 4 13 1983     14     4    no other      no     no 1.433213
## 5 5 13 1984     14     5    no other      no     no 1.568125
## 6 6 13 1985     14     6    no other      no     no 1.699891
##                      industry                          occupation
## 1 Business_and_Repair_Service                     Service_Workers
## 2            Personal_Service                     Service_Workers
## 3 Business_and_Repair_Service                     Service_Workers
## 4 Business_and_Repair_Service                     Service_Workers
## 5            Personal_Service      Craftsmen, Foremen_and_kindred
## 6 Business_and_Repair_Service Managers, Officials_and_Proprietors
##    residence
## 1 north_east
## 2 north_east
## 3 north_east
## 4 north_east
## 5 north_east
## 6 north_east
summary(income)
##        X              nr             year          school     
##  Min.   :   1   Min.   :   13   Min.   :1980   Min.   : 3.00  
##  1st Qu.:1091   1st Qu.: 2329   1st Qu.:1982   1st Qu.:11.00  
##  Median :2180   Median : 4569   Median :1984   Median :12.00  
##  Mean   :2180   Mean   : 5262   Mean   :1984   Mean   :11.77  
##  3rd Qu.:3270   3rd Qu.: 8406   3rd Qu.:1985   3rd Qu.:12.00  
##  Max.   :4360   Max.   :12548   Max.   :1987   Max.   :16.00  
##                                                               
##      exper        union         ethn      married    health    
##  Min.   : 0.000   no :3296   black: 504   no :2446   no :4286  
##  1st Qu.: 4.000   yes:1064   hisp : 680   yes:1914   yes:  74  
##  Median : 6.000              other:3176                        
##  Mean   : 6.515                                                
##  3rd Qu.: 9.000                                                
##  Max.   :18.000                                                
##                                                                
##       wage                                    industry   
##  Min.   :-3.579   Manufacturing                   :1231  
##  1st Qu.: 1.351   Trade                           :1169  
##  Median : 1.671   Professional_and_Related Service: 333  
##  Mean   : 1.649   Business_and_Repair_Service     : 331  
##  3rd Qu.: 1.991   Construction                    : 327  
##  Max.   : 4.052   Transportation                  : 286  
##                   (Other)                         : 683  
##                                occupation            residence   
##  Craftsmen, Foremen_and_kindred     :934   north_east     : 733  
##  Operatives_and_kindred             :881   nothern_central: 964  
##  Service_Workers                    :509   rural_area     :  85  
##  Clerical_and_kindred               :486   south          :1333  
##  Professional, Technical_and_kindred:453   NA's           :1245  
##  Laborers_and_farmers               :401                         
##  (Other)                            :696
# Central tendency of wage and experience over the whole dataset.
mean_wage <- mean(income[["wage"]])
med_wage <- median(income[["wage"]])
mean_exp <- mean(income[["exper"]])
med_exp <- median(income[["exper"]])
mean_wage
## [1] 1.649147
mean_exp
## [1] 6.514679
med_wage
## [1] 1.671143
med_exp
## [1] 6

Question 2

# Keep observations after 1982 and only the first six columns plus wage,
# selected by name rather than by position.
income_subset <- income[
  income[["year"]] > 1982,
  c("X", "nr", "year", "school", "exper", "union", "wage")
]
head(income_subset)
##     X nr year school exper union       wage
## 4   4 13 1983     14     4    no  1.4332133
## 5   5 13 1984     14     5    no  1.5681251
## 6   6 13 1985     14     6    no  1.6998909
## 7   7 13 1986     14     7    no -0.7202626
## 8   8 13 1987     14     8    no  1.6691879
## 12 12 17 1983     13     7    no  1.7254101
# Sanity check: count the rows from 1982 or earlier left in the subset.
# The trailing comma makes this a row filter; without it the logical vector
# is interpreted as a column selector and the check says nothing about rows.
nrow(income_subset[income_subset$year <= 1982, ])
## [1] 0

Looks like the subsetting worked

class(income_subset)
## [1] "data.frame"

And the class is correct.

Question 3

# Give the subset's columns descriptive names.
names(income_subset) <- c(
  "rownum",
  "id",
  "year_of_work",
  "education_yrs",
  "yrs_experience",
  "is_union",
  "hrly_wage"
)
head(income_subset)
##    rownum id year_of_work education_yrs yrs_experience is_union  hrly_wage
## 4       4 13         1983            14              4       no  1.4332133
## 5       5 13         1984            14              5       no  1.5681251
## 6       6 13         1985            14              6       no  1.6998909
## 7       7 13         1986            14              7       no -0.7202626
## 8       8 13         1987            14              8       no  1.6691879
## 12     12 17         1983            13              7       no  1.7254101

Question 4

summary(income_subset)
##      rownum           id         year_of_work  education_yrs  
##  Min.   :   4   Min.   :   13   Min.   :1983   Min.   : 3.00  
##  1st Qu.:1093   1st Qu.: 2329   1st Qu.:1984   1st Qu.:11.00  
##  Median :2182   Median : 4569   Median :1985   Median :12.00  
##  Mean   :2182   Mean   : 5262   Mean   :1985   Mean   :11.77  
##  3rd Qu.:3271   3rd Qu.: 8406   3rd Qu.:1986   3rd Qu.:12.00  
##  Max.   :4360   Max.   :12548   Max.   :1987   Max.   :16.00  
##  yrs_experience   is_union     hrly_wage     
##  Min.   : 3.000   no :2074   Min.   :-3.579  
##  1st Qu.: 6.000   yes: 651   1st Qu.: 1.442  
##  Median : 8.000              Median : 1.766  
##  Mean   : 8.015              Mean   : 1.743  
##  3rd Qu.: 9.000              3rd Qu.: 2.069  
##  Max.   :18.000              Max.   : 4.052
# Same summary statistics as for the whole dataset, now on the subset.
mean_wage_s <- mean(income_subset[["hrly_wage"]])
med_wage_s <- median(income_subset[["hrly_wage"]])
mean_exp_s <- mean(income_subset[["yrs_experience"]])
med_exp_s <- median(income_subset[["yrs_experience"]])

mean_wage_s
## [1] 1.743033
mean_exp_s
## [1] 8.014679
med_wage_s
## [1] 1.765844
med_exp_s
## [1] 8
# Put the whole-dataset and subset statistics side by side, one row per
# statistic.
sums <- c(mean_wage, mean_exp, med_wage, med_exp)
sums_s <- c(mean_wage_s, mean_exp_s, med_wage_s, med_exp_s)
comparison <- data.frame(
  whole_dataset = sums,
  subset = sums_s,
  row.names = c(
    "Mean Wage", "Mean Experience", "Median Wage", "Median Experience"
  )
)
comparison
##                   whole_dataset   subset
## Mean Wage              1.649147 1.743033
## Mean Experience        6.514679 8.014679
## Median Wage            1.671143 1.765844
## Median Experience      6.000000 8.000000

It appears that the dataset follows the same individuals every year, because the mean

experience is exactly 1.5 years greater for the subset.

Let’s check:

# Look at a small slice of individuals (ids below 500) and count how many
# observations each one has.
small <- income[income[["nr"]] < 500, ]
table(small[["nr"]])
## 
##  13  17  18  45 110 120 126 150 162 166 189 193 209 212 218 243 259 260 
##   8   8   8   8   8   8   8   8   8   8   8   8   8   8   8   8   8   8 
## 309 351 353 383 408 424 464 483 
##   8   8   8   8   8   8   8   8

Looks like that is the case: every individual in this slice appears exactly 8 times, once for each year from 1980 to 1987.

Question 5

#' Classify years of experience into tenure buckets
#'
#' Vectorised: works on a single value or a whole column. Values below 2 map
#' to "Under 2", 2 up to (not including) 5 to "2 to 4", 5 up to (not
#' including) 10 to "5 to 9", and 10 or more to "10+".
#'
#' @param col Numeric vector of years of experience.
#' @return Character vector the same length as `col`; `NA` inputs give `NA`.
tenure_class <- function(col) {
  labels <- c("Under 2", "2 to 4", "5 to 9", "10+")
  # findInterval() counts how many lower bounds are <= each value (intervals
  # are closed on the left), so adding 1 gives the index into `labels`.
  bucket <- findInterval(col, c(2, 5, 10)) + 1L
  labels[bucket]
}

# Replace numeric experience with its tenure bucket. vapply() yields a plain
# character vector; lapply() would store a list column in the data frame,
# which breaks table(), factor() and grouping on that column.
income_subset$yrs_experience <- vapply(
  income_subset$yrs_experience, tenure_class, character(1)
)

Question 6

head(income_subset,20)
##    rownum id year_of_work education_yrs yrs_experience is_union  hrly_wage
## 4       4 13         1983            14         2 to 4       no  1.4332133
## 5       5 13         1984            14         5 to 9       no  1.5681251
## 6       6 13         1985            14         5 to 9       no  1.6998909
## 7       7 13         1986            14         5 to 9       no -0.7202626
## 8       8 13         1987            14         5 to 9       no  1.6691879
## 12     12 17         1983            13         5 to 9       no  1.7254101
## 13     13 17         1984            13         5 to 9       no  1.6220223
## 14     14 17         1985            13         5 to 9       no  1.6085883
## 15     15 17         1986            13            10+       no  1.5723854
## 16     16 17         1987            13            10+       no  1.8203339
## 20     20 18         1983            12         5 to 9       no  1.9982288
## 21     21 18         1984            12         5 to 9       no  2.1840142
## 22     22 18         1985            12         5 to 9       no  2.2666622
## 23     23 18         1986            12            10+       no  2.0699439
## 24     24 18         1987            12            10+       no  2.8731607
## 28     28 45         1983            12         5 to 9       no  1.7409143
## 29     29 45         1984            12         5 to 9       no  1.8232137
## 30     30 45         1985            12         5 to 9       no  1.9082729
## 31     31 45         1986            12         5 to 9       no  1.7424474
## 32     32 45         1987            12         5 to 9       no  2.1356889