steel <- read.csv('steel_import_1.csv',skip=15, stringsAsFactors = F, header= FALSE)[1:47,]
names(steel) <- c('Country_areas','201810_prequantity','201810_prevalue',
'201809_prequantity','201809_prevalue',
'201809_finalquantity','201809_finalvalue',
'201710_finalquantity','201710_finalvalue',
'201709_finalquantity','201709_finalvalue')
head(steel)
## Country_areas 201810_prequantity 201810_prevalue 201809_prequantity
## 1 North America 739346 673946 637423
## 2 Canada 409300 403874 380465
## 3 Mexico 330046 270073 256958
## 4 Europe 682053 738322 673569
## 5 European Union 427571 541635 390937
## 6 Austria 25793 38920 25552
## 201809_prevalue 201809_finalquantity 201809_finalvalue
## 1 611864 638543 613821
## 2 380754 380484 380790
## 3 231110 258059 233032
## 4 681575 675161 682885
## 5 487953 392240 488745
## 6 35841 25621 35954
## 201710_finalquantity 201710_finalvalue 201709_finalquantity
## 1 743537 665119 729999
## 2 482878 454037 435284
## 3 260659 211081 294716
## 4 820708 794907 804507
## 5 435696 563048 455216
## 6 17302 30929 25836
## 201709_finalvalue
## 1 639544
## 2 414721
## 3 224823
## 4 750333
## 5 536615
## 6 41803
steel[steel=='(-)']<-NA
steel[,2:11] = steel%>%
select(2:11)%>%
# mutate_all(funs(str_replace(., ",", "")))%>%
mutate_all(as.numeric)
head(steel)
## Country_areas 201810_prequantity 201810_prevalue 201809_prequantity
## 1 North America 739346 673946 637423
## 2 Canada 409300 403874 380465
## 3 Mexico 330046 270073 256958
## 4 Europe 682053 738322 673569
## 5 European Union 427571 541635 390937
## 6 Austria 25793 38920 25552
## 201809_prevalue 201809_finalquantity 201809_finalvalue
## 1 611864 638543 613821
## 2 380754 380484 380790
## 3 231110 258059 233032
## 4 681575 675161 682885
## 5 487953 392240 488745
## 6 35841 25621 35954
## 201710_finalquantity 201710_finalvalue 201709_finalquantity
## 1 743537 665119 729999
## 2 482878 454037 435284
## 3 260659 211081 294716
## 4 820708 794907 804507
## 5 435696 563048 455216
## 6 17302 30929 25836
## 201709_finalvalue
## 1 639544
## 2 414721
## 3 224823
## 4 750333
## 5 536615
## 6 41803
steel_final_1 = gather(steel,year_month_type, Number,2:11, factor_key = TRUE)%>%
separate(year_month_type, c('year_month','quant_value'),sep='_')%>%
spread(quant_value,'Number')
head(steel_final_1,10)
## Country_areas year_month finalquantity finalvalue prequantity prevalue
## 1 Argentina 201709 13724 16354 NA NA
## 2 Argentina 201710 20635 21591 NA NA
## 3 Argentina 201809 12559 18054 12559 18054
## 4 Argentina 201810 NA NA 10130 12041
## 5 Australia 201709 22087 17841 NA NA
## 6 Australia 201710 21412 15981 NA NA
## 7 Australia 201809 18775 17519 18775 17519
## 8 Australia 201810 NA NA 28739 28244
## 9 Austria 201709 25836 41803 NA NA
## 10 Austria 201710 17302 30929 NA NA
steel_final_2 = gather(steel,year_month_type, Number,2:11, factor_key = TRUE)%>%
separate(year_month_type, c('year_month','quant_value'),sep='_')%>%
spread(year_month,'Number')
head(steel_final_2,10)
## Country_areas quant_value 201709 201710 201809 201810
## 1 Argentina finalquantity 13724 20635 12559 NA
## 2 Argentina finalvalue 16354 21591 18054 NA
## 3 Argentina prequantity NA NA 12559 10130
## 4 Argentina prevalue NA NA 18054 12041
## 5 Australia finalquantity 22087 21412 18775 NA
## 6 Australia finalvalue 17841 15981 17519 NA
## 7 Australia prequantity NA NA 18775 28739
## 8 Australia prevalue NA NA 17519 28244
## 9 Austria finalquantity 25836 17302 25621 NA
## 10 Austria finalvalue 41803 30929 35954 NA
final_quantity = steel_final_2%>%
filter(quant_value =='finalquantity')%>%
rowwise()%>%
mutate(finalquantity_overall= sum(`201709`,`201710`,`201809`,`201810`,na.rm=TRUE))%>%
arrange(desc(finalquantity_overall))
head(final_quantity)
## # A tibble: 6 x 7
## Country_areas quant_value `201709` `201710` `201809` `201810`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Europe finalquant… 804507 820708 675161 NA
## 2 North America finalquant… 729999 743537 638543 NA
## 3 Pacific Rim … finalquant… 691501 662335 418794 NA
## 4 Canada finalquant… 435284 482878 380484 NA
## 5 European Uni… finalquant… 455216 435696 392240 NA
## 6 Euro Area finalquant… 396011 358825 292800 NA
## # ... with 1 more variable: finalquantity_overall <dbl>
final_value = steel_final_2%>%
filter(quant_value =='finalvalue')%>%
rowwise()%>%
mutate(finalvalue_overall= sum(`201709`,`201710`,`201809`,`201810`,na.rm=TRUE))%>%
arrange(desc(finalvalue_overall))
head(final_value)
## # A tibble: 6 x 7
## Country_areas quant_value `201709` `201710` `201809` `201810`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Europe finalvalue 750333 794907 682885 NA
## 2 North America finalvalue 639544 665119 613821 NA
## 3 Pacific Rim … finalvalue 667787 657377 473571 NA
## 4 European Uni… finalvalue 536615 563048 488745 NA
## 5 Euro Area finalvalue 447539 450147 364049 NA
## 6 Canada finalvalue 414721 454037 380790 NA
## # ... with 1 more variable: finalvalue_overall <dbl>
Europe has the highest overall quantity.
final_quantity$Country_areas <- factor(final_quantity$Country_areas, levels = final_quantity$Country_areas[order(final_quantity$finalquantity_overall)])
g=ggplot(final_quantity,aes(y=finalquantity_overall,x=Country_areas,label=Country_areas))
g+geom_bar(stat='identity', aes(fill=finalquantity_overall), width=.5)+
coord_flip()
Europe has the highest overall value.
final_value$Country_areas <- factor(final_value$Country_areas, levels = final_value$Country_areas[order(final_value$finalvalue_overall)])
g=ggplot(final_value,aes(y=finalvalue_overall,x=Country_areas,label=Country_areas))
g+geom_bar(stat='identity', aes(fill=finalvalue_overall), width=.5)+
coord_flip()
Brazil has the hightest unit price while Singapore has the lowest unit price.
unit_value=data.frame(final_value$Country_areas,final_quantity$finalquantity_overall,final_value$finalvalue_overall)
names(unit_value) = c('country','quantity','value')
unit_value=unit_value%>%
mutate(unit_value = quantity /value )
unit_value$country <- factor(unit_value$country, levels = unit_value$country[order(unit_value$unit_value)])
g1=ggplot(unit_value,aes(y=unit_value,x=country,label=country))
g1+geom_bar(stat='identity', aes(fill=unit_value), width=.5)+
coord_flip()
## Warning: Removed 1 rows containing missing values (position_stack).
#####3. What does unit value distribution looks like?
The unit value distribution is a normal distribution with slightly left skewed.
The average unit value is 0.9373.
ggplot(unit_value, aes(x=unit_value)) +
geom_histogram(binwidth = 0.1, color="darkblue", fill="lightblue")+
geom_density(alpha=.2, fill="#FF6666")
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## Warning: Removed 1 rows containing non-finite values (stat_density).
summary(unit_value)
## country quantity value unit_value
## Singapore : 1 Min. : 0 Min. : 0 Min. :0.2360
## Colombia : 1 1st Qu.: 27938 1st Qu.: 31471 1st Qu.:0.8046
## Hong Kong : 1 Median : 86679 Median : 104412 Median :0.9368
## Norway : 1 Mean : 377578 Mean : 361586 Mean :0.9373
## Poland : 1 3rd Qu.: 515234 3rd Qu.: 407678 3rd Qu.:1.0529
## United Kingdom: 1 Max. :2300376 Max. :2228125 Max. :1.4819
## (Other) :41 NA's :1
Most quantity difference falls in 0-100 bin. This is right skewed distribution.
diff = steel_final_1%>%
mutate(quantity_diff = finalquantity - prequantity,
value_diff = finalvalue - prevalue)%>%
filter(!is.na(quantity_diff))
ggplot(diff, aes(x=quantity_diff)) +
geom_histogram(binwidth = 100, color="darkblue", fill="lightblue")+
geom_density(alpha=.2, fill="#FF6666")