# Environment variables locating the Hadoop executable and the streaming jar.
# They are set before the Hadoop packages below are loaded.
Sys.setenv(
  HADOOP_CMD = "/usr/bin/hadoop",
  HADOOP_STREAMING = "/usr/hdp/2.3.0.0-2557/hadoop-mapreduce/hadoop-streaming-2.7.1.2.3.0.0-2557.jar"
)
library(rhdfs)
## Loading required package: rJava
##
## HADOOP_CMD=/usr/bin/hadoop
##
## Be sure to run hdfs.init()
library(rmr2)
## Warning: S3 methods 'gorder.default', 'gorder.factor',
## 'gorder.data.frame', 'gorder.matrix', 'gorder.raw' were declared in
## NAMESPACE but not found
## Please review your hadoop settings. See help(hadoop.settings)
# Initialize the connection from RStudio to Hadoop
hdfs.init()
# Input format shared by the jobs below: comma-separated text
csv.input.format <- make.input.format(
  format = "csv",
  mode = "text",
  sep = ","
)
# HDFS directory that holds the input file
hdfs.root <- "/user/share/student"
# Full HDFS path of the input file: the data filename appended to the root
hdfs.data <- file.path(hdfs.root, "test_25K.csv")
#### Problem 1 ####
# Map step for Problem 1: emit one (origin, canceled) pair per input record.
#
# Args:
#   k:     key handed to the mapper; not used.
#   lines: chunk of parsed input records, indexed by column position
#          (presumably a data frame built by the csv input format -- confirm).
#
# Returns:
#   keyval() pairing column 17 (the key) with column 22 converted to numeric.
#   Entries of column 22 that cannot be parsed as numbers become NA.
plane_origin_canceled_map <- function(k, lines) {
  origin <- lines[[17]]
  canceled <- lines[[22]]
  # as.numeric() on a factor returns the internal level codes rather than the
  # numbers the labels spell out, so convert through character in that case.
  if (is.factor(canceled)) {
    canceled <- as.character(canceled)
  }
  keyval(origin, as.numeric(canceled))
}
# Reduce step for Problem 1: total the values collected for one key.
#
# Args:
#   origin: the key the values were grouped under.
#   counts: the values emitted for that key; NA entries are ignored.
#
# Returns:
#   keyval() pairing the key with the sum of its non-missing values.
plane_origin_canceled_reduce <- function(origin, counts) {
  total <- sum(counts, na.rm = TRUE)
  keyval(origin, total)
}
# Run the Problem 1 MapReduce job.
#
# Args:
#   origin_canceled_input:  input passed to mapreduce(); read with
#                           csv.input.format.
#   origin_canceled_output: output passed to mapreduce(); defaults to NULL.
#
# Returns:
#   Whatever mapreduce() returns for the job.
plane_origin_canceled <- function(origin_canceled_input,
                                  origin_canceled_output = NULL) {
  mapreduce(
    input = origin_canceled_input,
    output = origin_canceled_output,
    input.format = csv.input.format,
    map = plane_origin_canceled_map,
    reduce = plane_origin_canceled_reduce
  )
}
# Give every run its own output name so earlier results on HDFS never have to
# be deleted by hand. The stamp is all-numeric (independent of the locale) and
# goes down to the second, so two runs within the same minute get different
# paths.
outfile_origin_canceled <- paste(
  "origin_canceled",
  format(Sys.time(), "%Y_%m_%d_%H_%M_%S"),
  sep = "_"
)
# Append the output name to the HDFS root path
hdfs_origin_canceled.out <- file.path(hdfs.root, outfile_origin_canceled)
# Run the job, then read its result back into a local data frame
out_origin_canceled <- plane_origin_canceled(hdfs.data, hdfs_origin_canceled.out)
results_origin_canceled <- as.data.frame(
  from.dfs(out_origin_canceled),
  stringsAsFactors = FALSE # spelled out: F is an ordinary, reassignable variable
)
colnames(results_origin_canceled) <- c("Origin", "Canceled")
print(results_origin_canceled)
## Origin Canceled
## 1 ABE 0
## 2 ABI 1
## 3 ABQ 1
## 4 ABY 1
## 5 ACK 0
## 6 ACT 0
## 7 ACV 1
## 8 ACY 0
## 9 ADQ 1
## 10 AEX 0
## 11 AGS 2
## 12 ALB 1
## 13 AMA 0
## 14 ANC 1
## 15 ATL 26
## 16 ATW 0
## 17 AUS 4
## 18 AVL 1
## 19 AVP 0
## 20 AZO 1
## 21 BDL 0
## 22 BET 1
## 23 BFL 0
## 24 BGM 0
## 25 BGR 1
## 26 BHM 0
## 27 BIL 0
## 28 BIS 0
## 29 BMI 0
## 30 BNA 4
## 31 BOI 0
## 32 BOS 22
## 33 BPT 0
## 34 BQK 0
## 35 BQN 0
## 36 BRO 0
## 37 BRW 0
## 38 BTM 0
## 39 BTR 0
## 40 BTV 1
## 41 BUF 1
## 42 BUR 1
## 43 BWI 6
## 44 BZN 0
## 45 CAE 1
## 46 CAK 0
## 47 CDC 0
## 48 CDV 0
## 49 CEC 1
## 50 CHA 1
## 51 CHO 0
## 52 CHS 4
## 53 CIC 0
## 54 CID 2
## 55 CLD 0
## 56 CLE 3
## 57 CLL 0
## 58 CLT 5
## 59 CMH 5
## 60 CMI 0
## 61 COD 1
## 62 COS 1
## 63 CPR 0
## 64 CRP 0
## 65 CRW 0
## 66 CSG 1
## 67 CVG 24
## 68 DAB 2
## 69 DAL 7
## 70 DAY 2
## 71 DBQ 1
## 72 DCA 6
## 73 DEN 6
## 74 DFW 11
## 75 DHN 1
## 76 DLH 0
## 77 DRO 0
## 78 DSM 1
## 79 DTW 6
## 80 EFD 0
## 81 EGE 0
## 82 EKO 0
## 83 ELP 1
## 84 ERI 0
## 85 EUG 0
## 86 EVV 0
## 87 EWR 16
## 88 EYW 0
## 89 FAI 2
## 90 FAR 0
## 91 FAT 1
## 92 FAY 0
## 93 FCA 0
## 94 FLL 2
## 95 FNT 0
## 96 FSD 2
## 97 FSM 0
## 98 FWA 0
## 99 GEG 0
## 100 GFK 0
## 101 GGG 0
## 102 GJT 0
## 103 GNV 0
## 104 GPT 0
## 105 GRB 0
## 106 GRK 0
## 107 GRR 0
## 108 GSO 4
## 109 GSP 5
## 110 GST 0
## 111 GTF 0
## 112 GTR 1
## 113 GUC 0
## 114 HDN 0
## 115 HLN 0
## 116 HNL 0
## 117 HOU 12
## 118 HPN 0
## 119 HRL 0
## 120 HSV 0
## 121 HTS 1
## 122 HVN 1
## 123 IAD 12
## 124 IAH 7
## 125 ICT 2
## 126 IDA 0
## 127 ILE 0
## 128 ILM 0
## 129 IND 3
## 130 IPL 0
## 131 ISP 1
## 132 ITO 0
## 133 IYK 0
## 134 JAC 1
## 135 JAN 0
## 136 JAX 4
## 137 JFK 6
## 138 JNU 1
## 139 KOA 0
## 140 KTN 0
## 141 LAN 0
## 142 LAS 4
## 143 LAW 0
## 144 LAX 9
## 145 LBB 1
## 146 LCH 0
## 147 LEX 2
## 148 LFT 0
## 149 LGA 7
## 150 LGB 0
## 151 LIH 0
## 152 LIT 0
## 153 LNK 0
## 154 LNY 0
## 155 LRD 0
## 156 LSE 0
## 157 LWB 0
## 158 LYH 0
## 159 MAF 0
## 160 MBS 0
## 161 MCI 2
## 162 MCN 0
## 163 MCO 8
## 164 MDT 0
## 165 MDW 1
## 166 MEI 0
## 167 MEM 0
## 168 MFE 0
## 169 MFR 0
## 170 MGM 0
## 171 MHT 0
## 172 MIA 5
## 173 MKE 1
## 174 MKK 0
## 175 MLB 1
## 176 MLI 0
## 177 MLU 0
## 178 MOB 0
## 179 MOD 0
## 180 MOT 0
## 181 MQT 0
## 182 MRY 1
## 183 MSN 0
## 184 MSO 1
## 185 MSP 11
## 186 MSY 4
## 187 MTJ 0
## 188 MYR 2
## 189 OAK 1
## 190 OGG 0
## 191 OKC 2
## 192 OMA 2
## 193 OME 1
## 194 ONT 1
## 195 ORD 45
## 196 ORF 1
## 197 OTZ 0
## 198 OXR 0
## 199 PBI 0
## 200 PDX 3
## 201 PFN 0
## 202 PHF 0
## 203 PHL 6
## 204 PHX 7
## 205 PIA 0
## 206 PIE 0
## 207 PIH 0
## 208 PIT 2
## 209 PNS 0
## 210 PSC 0
## 211 PSG 0
## 212 PSP 1
## 213 PVD 1
## 214 PWM 2
## 215 RAP 0
## 216 RDD 0
## 217 RDM 0
## 218 RDU 6
## 219 RIC 3
## 220 RNO 1
## 221 ROA 2
## 222 ROC 2
## 223 RST 0
## 224 RSW 2
## 225 SAN 6
## 226 SAT 3
## 227 SAV 1
## 228 SBA 0
## 229 SBN 0
## 230 SBP 0
## 231 SCC 0
## 232 SCE 1
## 233 SDF 0
## 234 SEA 5
## 235 SFO 4
## 236 SGF 0
## 237 SGU 0
## 238 SHV 0
## 239 SIT 0
## 240 SJC 3
## 241 SJT 0
## 242 SJU 3
## 243 SLC 6
## 244 SMF 2
## 245 SMX 0
## 246 SNA 4
## 247 SPS 0
## 248 SRQ 1
## 249 STL 3
## 250 STT 1
## 251 STX 0
## 252 SUN 0
## 253 SWF 0
## 254 SYR 0
## 255 TLH 2
## 256 TOL 0
## 257 TPA 8
## 258 TRI 0
## 259 TUL 2
## 260 TUS 0
## 261 TVC 1
## 262 TWF 0
## 263 TXK 0
## 264 TYR 0
## 265 TYS 3
## 266 VCT 0
## 267 VIS 0
## 268 VLD 0
## 269 VPS 0
## 270 WRG 0
## 271 XNA 3
## 272 YAK 0
## 273 YUM 0
#### Problem 2 ####
# Map step for Problem 2: emit one (destination, taxi-in) pair per input record.
#
# Args:
#   k:     key handed to the mapper; not used.
#   lines: chunk of parsed input records, indexed by column position
#          (presumably a data frame built by the csv input format -- confirm).
#
# Returns:
#   keyval() pairing column 18 (the key) with column 20 converted to numeric.
#   Entries of column 20 that cannot be parsed as numbers become NA.
plane_dest_taxi_map <- function(k, lines) {
  destination <- lines[[18]]
  taxi_in <- lines[[20]]
  # as.numeric() on a factor returns the internal level codes rather than the
  # numbers the labels spell out, so convert through character in that case.
  if (is.factor(taxi_in)) {
    taxi_in <- as.character(taxi_in)
  }
  keyval(destination, as.numeric(taxi_in))
}
# Reduce step for Problem 2: average the values collected for one key.
#
# Args:
#   origin: the key the values were grouped under. The mapper for this job
#           emits destinations as keys; the parameter name is kept as is.
#   counts: the values emitted for that key; NA entries are ignored.
#
# Returns:
#   keyval() pairing the key with the mean of its non-missing values.
plane_dest_taxi_reduce <- function(origin, counts) {
  average <- mean(counts, na.rm = TRUE)
  keyval(origin, average)
}
# Run the Problem 2 MapReduce job.
#
# Args (names carried over from the Problem 1 wrapper):
#   origin_canceled_input:  input passed to mapreduce(); read with
#                           csv.input.format.
#   origin_canceled_output: output passed to mapreduce(); defaults to NULL.
#
# Returns:
#   Whatever mapreduce() returns for the job.
plane_dest_taxi <- function(origin_canceled_input,
                            origin_canceled_output = NULL) {
  mapreduce(
    input = origin_canceled_input,
    output = origin_canceled_output,
    input.format = csv.input.format,
    map = plane_dest_taxi_map,
    reduce = plane_dest_taxi_reduce
  )
}
# Give every run its own output name so earlier results on HDFS never have to
# be deleted by hand. The stamp is all-numeric (independent of the locale) and
# goes down to the second, so two runs within the same minute get different
# paths.
outfile_dest_taxi <- paste(
  "dest_taxi",
  format(Sys.time(), "%Y_%m_%d_%H_%M_%S"),
  sep = "_"
)
# Append the output name to the HDFS root path
hdfs_dest_taxi.out <- file.path(hdfs.root, outfile_dest_taxi)
# Run the job, then read its result back into a local data frame
out_dest_taxi <- plane_dest_taxi(hdfs.data, hdfs_dest_taxi.out)
results_dest_taxi <- as.data.frame(
  from.dfs(out_dest_taxi),
  stringsAsFactors = FALSE # spelled out: F is an ordinary, reassignable variable
)
colnames(results_dest_taxi) <- c("Destination", "TaxiIN")
print(results_dest_taxi)
## Destination TaxiIN
## 1 ABE 3.904762
## 2 ABI 2.333333
## 3 ABQ 4.633803
## 4 ABY 3.333333
## 5 ACK 6.000000
## 6 ACT 4.250000
## 7 ACV 2.636364
## 8 ACY 3.000000
## 9 ADQ 2.500000
## 10 AEX 3.363636
## 11 AGS 2.615385
## 12 AKN 2.500000
## 13 ALB 3.575758
## 14 AMA 3.966667
## 15 ANC 4.697368
## 16 ATL 14.749306
## 17 ATW 3.666667
## 18 AUS 4.492857
## 19 AVL 2.444444
## 20 AVP 3.307692
## 21 AZO 3.312500
## 22 BDL 4.760870
## 23 BET 2.250000
## 24 BFL 5.666667
## 25 BGM 14.600000
## 26 BGR 4.000000
## 27 BHM 3.734177
## 28 BIL 4.550000
## 29 BIS 4.000000
## 30 BMI 3.833333
## 31 BNA 5.080402
## 32 BOI 3.593220
## 33 BOS 7.284783
## 34 BPT 2.400000
## 35 BQK 3.000000
## 36 BQN 4.250000
## 37 BRO 3.500000
## 38 BRW 2.666667
## 39 BTM 4.000000
## 40 BTR 4.263158
## 41 BTV 3.868421
## 42 BUF 4.425743
## 43 BUR 2.590000
## 44 BWI 5.224932
## 45 BZN 3.714286
## 46 CAE 38.238095
## 47 CAK 4.815789
## 48 CDC 3.000000
## 49 CDV 4.000000
## 50 CEC 1.833333
## 51 CHA 2.800000
## 52 CHO 2.363636
## 53 CHS 5.649123
## 54 CIC 2.500000
## 55 CID 3.888889
## 56 CLD 3.200000
## 57 CLE 5.646884
## 58 CLL 6.764706
## 59 CLT 4.759162
## 60 CMH 4.564885
## 61 CMI 3.818182
## 62 COD 3.000000
## 63 COS 6.250000
## 64 CPR 4.000000
## 65 CRP 3.720000
## 66 CRW 3.240000
## 67 CSG 2.500000
## 68 CVG 8.266491
## 69 DAB 3.400000
## 70 DAL 2.993750
## 71 DAY 4.238806
## 72 DCA 7.492795
## 73 DEN 8.079929
## 74 DFW 15.217496
## 75 DHN 3.700000
## 76 DLG 4.000000
## 77 DLH 5.222222
## 78 DRO 2.000000
## 79 DSM 4.615385
## 80 DTW 9.264706
## 81 EFD 3.500000
## 82 EGE 3.600000
## 83 EKO 3.666667
## 84 ELP 3.366667
## 85 ERI 3.333333
## 86 EUG 3.294118
## 87 EVV 3.941176
## 88 EWR 8.128253
## 89 EYW 2.666667
## 90 FAI 3.851852
## 91 FAR 4.875000
## 92 FAT 3.750000
## 93 FAY 182.750000
## 94 FCA 3.900000
## 95 FLL 4.495968
## 96 FLO 5.400000
## 97 FNT 4.241379
## 98 FSD 3.900000
## 99 FSM 6.000000
## 100 FWA 2.782609
## 101 GEG 4.377778
## 102 GFK 5.600000
## 103 GGG 4.400000
## 104 GJT 2.500000
## 105 GNV 146.700000
## 106 GPT 5.055556
## 107 GRB 3.678571
## 108 GRK 3.750000
## 109 GRR 4.943396
## 110 GSO 3.873239
## 111 GSP 3.365854
## 112 GTF 3.666667
## 113 GTR 2.333333
## 114 HDN 3.000000
## 115 HLN 3.444444
## 116 HNL 5.321429
## 117 HOU 4.117647
## 118 HPN 4.966667
## 119 HRL 3.086957
## 120 HSV 4.833333
## 121 HTS 2.000000
## 122 HVN 3.500000
## 123 IAD 8.997930
## 124 IAH 9.780899
## 125 ICT 36.666667
## 126 IDA 2.714286
## 127 ILE 4.615385
## 128 ILM 3.090909
## 129 IND 7.703448
## 130 IPL 2.200000
## 131 ISP 3.470588
## 132 ITO 4.285714
## 133 IYK 3.250000
## 134 JAC 2.500000
## 135 JAN 87.288462
## 136 JAX 4.947368
## 137 JFK 8.664557
## 138 JNU 3.750000
## 139 KOA 4.074074
## 140 KTN 4.200000
## 141 LAN 3.000000
## 142 LAS 5.778163
## 143 LAW 3.833333
## 144 LAX 7.597418
## 145 LBB 3.166667
## 146 LCH 4.875000
## 147 LEX 3.852941
## 148 LFT 3.818182
## 149 LGA 8.155211
## 150 LGB 5.577778
## 151 LIH 4.920000
## 152 LIT 26.409836
## 153 LNK 4.000000
## 154 LRD 2.857143
## 155 LSE 5.666667
## 156 LWB 8.000000
## 157 LYH 3.400000
## 158 MAF 2.894737
## 159 MBS 3.642857
## 160 MCI 4.315789
## 161 MCN 3.222222
## 162 MCO 6.913158
## 163 MDT 4.325581
## 164 MDW 5.895082
## 165 MEI 3.000000
## 166 MEM 5.732143
## 167 MFE 4.300000
## 168 MFR 3.230769
## 169 MGM 3.200000
## 170 MHT 4.473684
## 171 MIA 8.027523
## 172 MKE 4.938462
## 173 MKK 3.000000
## 174 MLB 6.000000
## 175 MLI 4.307692
## 176 MLU 3.500000
## 177 MOB 3.615385
## 178 MOD 3.000000
## 179 MOT 5.000000
## 180 MQT 8.000000
## 181 MRY 3.357143
## 182 MSN 3.548387
## 183 MSO 4.800000
## 184 MSP 6.565619
## 185 MSY 3.614973
## 186 MTJ 3.000000
## 187 MYR 55.035714
## 188 OAK 4.987903
## 189 OGG 4.500000
## 190 OKC 4.213483
## 191 OMA 3.770270
## 192 OME 3.000000
## 193 ONT 3.915385
## 194 ORD 9.239912
## 195 ORF 20.823529
## 196 OTZ 2.500000
## 197 OXR 2.875000
## 198 PBI 19.020408
## 199 PDX 3.720588
## 200 PFN 3.272727
## 201 PHF 5.555556
## 202 PHL 7.063492
## 203 PHX 5.903169
## 204 PIA 3.437500
## 205 PIE 4.666667
## 206 PIH 2.166667
## 207 PIT 6.303318
## 208 PNS 4.000000
## 209 PSC 4.157895
## 210 PSG 2.666667
## 211 PSP 4.000000
## 212 PVD 3.921569
## 213 PWM 4.800000
## 214 RAP 4.166667
## 215 RDD 3.166667
## 216 RDM 2.416667
## 217 RDU 5.174468
## 218 RIC 5.051724
## 219 RNO 4.441441
## 220 ROA 4.100000
## 221 ROC 5.385965
## 222 RST 4.375000
## 223 RSW 3.573529
## 224 SAN 3.682119
## 225 SAT 3.137931
## 226 SAV 4.705882
## 227 SBA 3.956522
## 228 SBN 2.272727
## 229 SBP 3.000000
## 230 SCE 3.285714
## 231 SDF 4.974026
## 232 SEA 5.883085
## 233 SFO 5.452632
## 234 SGF 3.629630
## 235 SGU 2.750000
## 236 SHV 4.666667
## 237 SIT 5.250000
## 238 SJC 3.928000
## 239 SJT 4.500000
## 240 SJU 5.053763
## 241 SLC 5.988327
## 242 SMF 4.078652
## 243 SMX 2.833333
## 244 SNA 6.193548
## 245 SPS 7.000000
## 246 SRQ 4.166667
## 247 STL 4.422680
## 248 STT 3.333333
## 249 STX 3.000000
## 250 SUN 1.900000
## 251 SWF 4.900000
## 252 SYR 4.745098
## 253 TLH 4.192308
## 254 TOL 3.526316
## 255 TPA 4.430712
## 256 TRI 3.555556
## 257 TUL 3.226667
## 258 TUS 4.054795
## 259 TVC 3.272727
## 260 TWF 4.000000
## 261 TXK 3.000000
## 262 TYR 4.266667
## 263 TYS 3.550000
## 264 VCT 3.500000
## 265 VIS 3.000000
## 266 VLD 2.750000
## 267 VPS 8.666667
## 268 WRG 4.666667
## 269 XNA 34.270833
## 270 YAK 2.500000
## 271 YUM 3.000000