EP763_JSK

Author

Joy Emmanuel

Part 1 — Using inc_real.sav

When importing the SPSS file, you may see a warning such as “re-encoding from CP1252.” This is harmless and can be ignored, as suggested by the assignment instructions.

Q1. Read the SPSS file and inspect the data

# Q1-a: import the SPSS file as a data frame and print its first 10 rows.
# read.spss() lives in the 'foreign' package; the explicit foreign:: prefix
# keeps this chunk runnable even when library(foreign) has not been called.
inc_dat <- foreign::read.spss("inc_real.sav", to.data.frame = TRUE)
head(inc_dat, 10)  # equivalent: inc_dat[1:10, ]

   age    sex whours                             educat income   hwage edu
1   24   male     40 non-tertiary post-secondary degree  18000 112.500  15
2   43 female     40               academic high school  14500  90.625  12
3   27   male     40                     apprenticeship  18000 112.500  10
4   37   male     40                  compulsory school  15700  98.125   9
5   50   male     42               academic high school  38000 237.500  12
6   50   male     39                     apprenticeship  22000 137.500  10
7   30 female     40     special vocational high school   5200  32.500  13
8   60   male     39                     apprenticeship  12000  75.000  10
9   45   male     40                     apprenticeship  15000  93.750  10
10  26   male     39                     apprenticeship  13000  81.250  10
   potexp
1       3
2      25
3      11
4      22
5      32
6      34
7      11
8      44
9      29
10     10

# Q1-b: dim() returns c(number of rows, number of columns) of the data frame
dim(inc_dat)  # same information separately: nrow(inc_dat); ncol(inc_dat)

[1] 1271    8

# Q1-c: names() lists the column (variable) names of the data frame
names(inc_dat)

[1] "age"    "sex"    "whours" "educat" "income" "hwage"  "edu"    "potexp"

# Q1-d: str() shows each variable's storage type (numeric, factor, ...)
# together with its first few values
str(inc_dat)

'data.frame':   1271 obs. of  8 variables:
 $ age   : num  24 43 27 37 50 50 30 60 45 26 ...
 $ sex   : Factor w/ 2 levels "male","female": 1 2 1 1 1 1 2 1 1 1 ...
 $ whours: num  40 40 40 40 42 39 40 39 40 39 ...
 $ educat: Factor w/ 9 levels "no degree","compulsory school",..: 8 5 3 2 5 3 7 3 3 3 ...
 $ income: num  18000 14500 18000 15700 38000 22000 5200 12000 15000 13000 ...
 $ hwage : num  112.5 90.6 112.5 98.1 237.5 ...
 $ edu   : num  15 12 10 9 12 10 13 10 10 10 ...
 $ potexp: num  3 25 11 22 32 34 11 44 29 10 ...
 - attr(*, "variable.labels")= Named chr [1:8] "age" "sex" "working hours" "education (levels)" ...
  ..- attr(*, "names")= chr [1:8] "age" "sex" "whours" "educat" ...
 - attr(*, "codepage")= int 1252

## Q2. Summary statistics for all variables
# summary() reports quantiles and the mean for numeric columns and
# level counts for factor columns
summary(inc_dat)

      age            sex          whours     
 Min.   :16.00   male  :839   Min.   :36.00  
 1st Qu.:28.00   female:432   1st Qu.:38.00  
 Median :36.00                Median :40.00  
 Mean   :36.78                Mean   :39.87  
 3rd Qu.:45.00                3rd Qu.:40.00  
 Max.   :64.00                Max.   :80.00  
                                             
                              educat        income          hwage       
 apprenticeship                  :599   Min.   : 5000   Min.   : 31.25  
 compulsory school               :220   1st Qu.:13000   1st Qu.: 81.25  
 vocational school               :127   Median :15000   Median : 93.75  
 vocational high school          :101   Mean   :16822   Mean   :105.14  
 tertiary education (BA, MA, PhD): 87   3rd Qu.:20000   3rd Qu.:125.00  
 academic high school            : 66   Max.   :80819   Max.   :505.12  
 (Other)                         : 71                                   
      edu            potexp     
 Min.   : 9.00   Min.   : 0.00  
 1st Qu.:10.00   1st Qu.:11.00  
 Median :10.00   Median :19.00  
 Mean   :10.95   Mean   :19.84  
 3rd Qu.:12.00   3rd Qu.:28.00  
 Max.   :17.00   Max.   :46.00

## Q3. Sequences with 'seq()' and 'rep()' (length = 20)
# Q3-a: alternate 1 and 0 until the vector has 20 elements: 1 0 1 0 ...
rep(c(1, 0), length.out = 20)

 [1] 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0

# Q3-b: pairs of 1s and 0s, repeated to 20 elements: 1 1 0 0 1 1 0 0 ...
# 'each' duplicates every element first; 'times' then repeats that pattern
rep(c(1, 0), each = 2, times = 5)

 [1] 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0

# or:
#rep(rep(1:0, each = 2), times = 5)
# Q3-c: the sequence 0, 3, 6, 9 cycled to 20 elements: 0 3 6 9 0 3 6 9 ...
rep(seq(from = 0, to = 9, by = 3), length.out = 20)

 [1] 0 3 6 9 0 3 6 9 0 3 6 9 0 3 6 9 0 3 6 9

# Part 2 — Using income_exmpl.csv (the csv file from the class demos).
# The file "income_exmpl.csv" must be in the EP763R folder
# (presumably the working directory — confirm before running).
incex <- read.csv(file = "income_exmpl.csv", stringsAsFactors = TRUE)

# Optional sanity checks on the imported data:
# head(incex)
# str(incex)

## Q1. Ages of the 200th and 250th observations
# Pull the 'age' column by its exact name, then pick positions 200 and 250
incex[["age"]][c(200, 250)]

[1] 42 39

#or:
#incex[c(200, 250), "age"]

## Q2. Income for ages 25-35 with low educational level
# Build the row filter inside with(): 25 <= age <= 35 AND edu == "low",
# then keep only the 'income' column of the matching rows
incex[with(incex, age >= 25 & age <= 35 & edu == "low"), "income"]

  [1]  940 1064  977 1141  973 1232 1094 1074 1005 1478  843 1070 1004 1091 1129
 [16] 1157 1309 1015  782 1091  887  853 1197 1060 1073  990 1009 1006 1182 1028
 [31]  804 1384  951 1298 1104  855 1189  942  888  977 1096 1109 1017  994 1103
 [46] 1381  819 1103 1347  962 1276 1354 1043  994  939 1260 1202 1222 1026  715
 [61] 1041 1304 1040  932  963 1042  789  909 1024  984  951 1159 1036 1137  791
 [76] 1222 1231  895  994 1078 1044  916  841  883 1180 1279 1263  979 1029 1096
 [91] 1195  859 1266 1102  954 1070  966 1146  942  996  888 1081  862  867 1373
[106]  704  981  955  972  907 1055  817 1067 1204 1089 1033  807  798  913  982
[121] 1123 1262 1013 1217 1080 1062  809 1233  999 1372 1015 1274  999 1055 1200
[136] 1128 1104 1017  904 1073  922 1185  879 1096 1022 1337 1074 1097  759 1148
[151] 1181 1194 1095 1096 1111 1059  792 1131 1176 1165  945 1167 1140  720 1074
[166] 1252 1108 1107 1118 1253 1018 1163 1095  911 1121  887 1196 1018  993 1316
[181] 1226  861 1329 1472 1043  948 1007  937 1222 1336  895 1196  997  829  994
[196]  903  772  977  875 1079  931 1007 1000 1120 1217 1051 1071  823 1222  978
[211] 1305 1193 1453 1136 1208  782  753 1075 1002 1256 1429

# Same selection, indexing the income vector directly with the logical mask
incex$income[
  incex$age >= 25 & incex$age <= 35 & incex$edu == "low"
]

  [1]  940 1064  977 1141  973 1232 1094 1074 1005 1478  843 1070 1004 1091 1129
 [16] 1157 1309 1015  782 1091  887  853 1197 1060 1073  990 1009 1006 1182 1028
 [31]  804 1384  951 1298 1104  855 1189  942  888  977 1096 1109 1017  994 1103
 [46] 1381  819 1103 1347  962 1276 1354 1043  994  939 1260 1202 1222 1026  715
 [61] 1041 1304 1040  932  963 1042  789  909 1024  984  951 1159 1036 1137  791
 [76] 1222 1231  895  994 1078 1044  916  841  883 1180 1279 1263  979 1029 1096
 [91] 1195  859 1266 1102  954 1070  966 1146  942  996  888 1081  862  867 1373
[106]  704  981  955  972  907 1055  817 1067 1204 1089 1033  807  798  913  982
[121] 1123 1262 1013 1217 1080 1062  809 1233  999 1372 1015 1274  999 1055 1200
[136] 1128 1104 1017  904 1073  922 1185  879 1096 1022 1337 1074 1097  759 1148
[151] 1181 1194 1095 1096 1111 1059  792 1131 1176 1165  945 1167 1140  720 1074
[166] 1252 1108 1107 1118 1253 1018 1163 1095  911 1121  887 1196 1018  993 1316
[181] 1226  861 1329 1472 1043  948 1007  937 1222 1336  895 1196  997  829  994
[196]  903  772  977  875 1079  931 1007 1000 1120 1217 1051 1071  823 1222  978
[211] 1305 1193 1453 1136 1208  782  753 1075 1002 1256 1429

# Same selection using with(), so column names can be written unqualified
with(
  incex,
  income[age >= 25 & age <= 35 & edu == "low"]
)

  [1]  940 1064  977 1141  973 1232 1094 1074 1005 1478  843 1070 1004 1091 1129
 [16] 1157 1309 1015  782 1091  887  853 1197 1060 1073  990 1009 1006 1182 1028
 [31]  804 1384  951 1298 1104  855 1189  942  888  977 1096 1109 1017  994 1103
 [46] 1381  819 1103 1347  962 1276 1354 1043  994  939 1260 1202 1222 1026  715
 [61] 1041 1304 1040  932  963 1042  789  909 1024  984  951 1159 1036 1137  791
 [76] 1222 1231  895  994 1078 1044  916  841  883 1180 1279 1263  979 1029 1096
 [91] 1195  859 1266 1102  954 1070  966 1146  942  996  888 1081  862  867 1373
[106]  704  981  955  972  907 1055  817 1067 1204 1089 1033  807  798  913  982
[121] 1123 1262 1013 1217 1080 1062  809 1233  999 1372 1015 1274  999 1055 1200
[136] 1128 1104 1017  904 1073  922 1185  879 1096 1022 1337 1074 1097  759 1148
[151] 1181 1194 1095 1096 1111 1059  792 1131 1176 1165  945 1167 1140  720 1074
[166] 1252 1108 1107 1118 1253 1018 1163 1095  911 1121  887 1196 1018  993 1316
[181] 1226  861 1329 1472 1043  948 1007  937 1222 1336  895 1196  997  829  994
[196]  903  772  977  875 1079  931 1007 1000 1120 1217 1051 1071  823 1222  978
[211] 1305 1193 1453 1136 1208  782  753 1075 1002 1256 1429

# Same selection using subset(); 'select = income' keeps a one-column
# data frame, so the original row numbers are printed next to the incomes
subset(
  incex,
  subset = age >= 25 & age <= 35 & edu == "low",
  select = income
)

     income
21      940
23     1064
47      977
67     1141
123     973
125    1232
128    1094
161    1074
162    1005
170    1478
171     843
177    1070
182    1004
188    1091
221    1129
222    1157
239    1309
240    1015
248     782
259    1091
266     887
269     853
271    1197
274    1060
281    1073
283     990
296    1009
316    1006
321    1182
325    1028
333     804
345    1384
349     951
371    1298
386    1104
389     855
425    1189
428     942
435     888
442     977
444    1096
449    1109
455    1017
478     994
498    1103
502    1381
512     819
514    1103
535    1347
557     962
579    1276
583    1354
587    1043
592     994
594     939
595    1260
601    1202
603    1222
614    1026
616     715
633    1041
634    1304
642    1040
654     932
656     963
671    1042
672     789
683     909
697    1024
703     984
718     951
721    1159
727    1036
741    1137
744     791
762    1222
769    1231
788     895
793     994
796    1078
802    1044
804     916
815     841
818     883
820    1180
824    1279
828    1263
831     979
863    1029
866    1096
869    1195
890     859
897    1266
906    1102
910     954
933    1070
936     966
949    1146
969     942
972     996
974     888
976    1081
985     862
997     867
1001   1373
1004    704
1008    981
1052    955
1062    972
1064    907
1076   1055
1097    817
1098   1067
1100   1204
1139   1089
1140   1033
1154    807
1156    798
1157    913
1159    982
1162   1123
1164   1262
1168   1013
1175   1217
1188   1080
1200   1062
1218    809
1229   1233
1232    999
1242   1372
1247   1015
1248   1274
1251    999
1255   1055
1264   1200
1269   1128
1277   1104
1279   1017
1281    904
1284   1073
1287    922
1312   1185
1316    879
1330   1096
1334   1022
1339   1337
1346   1074
1353   1097
1354    759
1355   1148
1358   1181
1365   1194
1390   1095
1394   1096
1408   1111
1409   1059
1442    792
1445   1131
1455   1176
1456   1165
1468    945
1475   1167
1485   1140
1499    720
1502   1074
1507   1252
1514   1108
1520   1107
1525   1118
1532   1253
1537   1018
1538   1163
1546   1095
1576    911
1586   1121
1592    887
1593   1196
1594   1018
1595    993
1601   1316
1608   1226
1622    861
1627   1329
1636   1472
1667   1043
1673    948
1676   1007
1688    937
1697   1222
1700   1336
1705    895
1713   1196
1730    997
1738    829
1755    994
1764    903
1773    772
1790    977
1810    875
1813   1079
1818    931
1824   1007
1827   1000
1830   1120
1839   1217
1841   1051
1846   1071
1850    823
1853   1222
1867    978
1876   1305
1877   1193
1883   1453
1892   1136
1897   1208
1901    782
1905    753
1908   1075
1915   1002
1917   1256
1918   1429

## Q3. Subset where 'occ' is medium or high and 'oexp' > 45
# %in% tests membership in the set of wanted occupation levels
subset(incex, occ %in% c("med.", "high") & oexp > 45)

     sex age edu  occ oexp income
227    f  64 low med.   47   1139
481    m  65 low high   47   1532
858    m  65 low med.   48   1520
1132   m  63 low med.   46   1462
1173   m  65 low high   47   1591
1368   m  65 low med.   48   1368
1383   m  62 low high   46   1800
1718   m  64 low high   46   1607
1747   m  64 low high   46   1442

# Alternative: exclude "low" instead of listing the wanted levels
# (equivalent only if 'occ' has no levels besides low/med./high — TODO confirm)
subset(incex, subset = oexp > 45 & occ != "low")

     sex age edu  occ oexp income
227    f  64 low med.   47   1139
481    m  65 low high   47   1532
858    m  65 low med.   48   1520
1132   m  63 low med.   46   1462
1173   m  65 low high   47   1591
1368   m  65 low med.   48   1368
1383   m  62 low high   46   1800
1718   m  64 low high   46   1607
1747   m  64 low high   46   1442

## Q4. Mean occupational experience by gender among those with low education
# Version 1: keep only the low-education rows, then average 'oexp' per sex
incex_lowedu <- subset(incex, subset = edu == "low")
tapply(X = incex_lowedu$oexp, INDEX = incex_lowedu$sex, FUN = mean,
       na.rm = TRUE)

       f        m 
20.21884 24.78667

# Version 2: filter rows with bracket indexing and run tapply() inside with()
with(
  incex[incex$edu == "low", ],
  tapply(X = oexp, INDEX = sex, FUN = mean, na.rm = TRUE)
)

       f        m 
20.21884 24.78667

# Version 3: mean 'oexp' for every sex-by-edu cell, then keep the "low" column
with(incex, tapply(oexp, list(sex, edu), mean, na.rm = TRUE)[, "low"])

       f        m 
20.21884 24.78667

# Version 4: apply the same edu == "low" mask to both the values and the groups
tapply(
  X = incex$oexp[incex$edu == "low"],
  INDEX = incex$sex[incex$edu == "low"],
  FUN = mean,
  na.rm = TRUE
)

       f        m 
20.21884 24.78667

# Version 4 again, inside with() so the columns need no incex$ prefix
with(
  incex,
  tapply(oexp[edu == "low"], sex[edu == "low"], FUN = mean, na.rm = TRUE)
)

       f        m 
20.21884 24.78667