Read data from csv file , skip the first 15 rows

steel <- read.csv('steel_import_1.csv',skip=15, stringsAsFactors = F, header= FALSE)[1:47,]

DATA CLEAN

Rename columns

names(steel) <- c('Country_areas','201810_prequantity','201810_prevalue',
                                  '201809_prequantity','201809_prevalue',
                                  '201809_finalquantity','201809_finalvalue',
                                  '201710_finalquantity','201710_finalvalue',
                                  '201709_finalquantity','201709_finalvalue')
head(steel)

##    Country_areas 201810_prequantity 201810_prevalue 201809_prequantity
## 1  North America             739346          673946             637423
## 2         Canada             409300          403874             380465
## 3         Mexico             330046          270073             256958
## 4         Europe             682053          738322             673569
## 5 European Union             427571          541635             390937
## 6        Austria              25793           38920              25552
##   201809_prevalue 201809_finalquantity 201809_finalvalue
## 1          611864               638543            613821
## 2          380754               380484            380790
## 3          231110               258059            233032
## 4          681575               675161            682885
## 5          487953               392240            488745
## 6           35841                25621             35954
##   201710_finalquantity 201710_finalvalue 201709_finalquantity
## 1               743537            665119               729999
## 2               482878            454037               435284
## 3               260659            211081               294716
## 4               820708            794907               804507
## 5               435696            563048               455216
## 6                17302             30929                25836
##   201709_finalvalue
## 1            639544
## 2            414721
## 3            224823
## 4            750333
## 5            536615
## 6             41803

Replace (-) with NA and convert to number,

steel[steel=='(-)']<-NA
steel[,2:11] = steel%>%
      select(2:11)%>%
      # mutate_all(funs(str_replace(., ",", "")))%>%  
      mutate_all(as.numeric)
head(steel)

##    Country_areas 201810_prequantity 201810_prevalue 201809_prequantity
## 1  North America             739346          673946             637423
## 2         Canada             409300          403874             380465
## 3         Mexico             330046          270073             256958
## 4         Europe             682053          738322             673569
## 5 European Union             427571          541635             390937
## 6        Austria              25793           38920              25552
##   201809_prevalue 201809_finalquantity 201809_finalvalue
## 1          611864               638543            613821
## 2          380754               380484            380790
## 3          231110               258059            233032
## 4          681575               675161            682885
## 5          487953               392240            488745
## 6           35841                25621             35954
##   201710_finalquantity 201710_finalvalue 201709_finalquantity
## 1               743537            665119               729999
## 2               482878            454037               435284
## 3               260659            211081               294716
## 4               820708            794907               804507
## 5               435696            563048               455216
## 6                17302             30929                25836
##   201709_finalvalue
## 1            639544
## 2            414721
## 3            224823
## 4            750333
## 5            536615
## 6             41803

Transform dataset

steel_final_1 = gather(steel,year_month_type, Number,2:11, factor_key = TRUE)%>%
                separate(year_month_type, c('year_month','quant_value'),sep='_')%>%
                spread(quant_value,'Number')
head(steel_final_1,10)

##    Country_areas year_month finalquantity finalvalue prequantity prevalue
## 1      Argentina     201709         13724      16354          NA       NA
## 2      Argentina     201710         20635      21591          NA       NA
## 3      Argentina     201809         12559      18054       12559    18054
## 4      Argentina     201810            NA         NA       10130    12041
## 5      Australia     201709         22087      17841          NA       NA
## 6      Australia     201710         21412      15981          NA       NA
## 7      Australia     201809         18775      17519       18775    17519
## 8      Australia     201810            NA         NA       28739    28244
## 9        Austria     201709         25836      41803          NA       NA
## 10       Austria     201710         17302      30929          NA       NA

steel_final_2 = gather(steel,year_month_type, Number,2:11, factor_key = TRUE)%>%
                separate(year_month_type, c('year_month','quant_value'),sep='_')%>%
                spread(year_month,'Number')
head(steel_final_2,10)

##    Country_areas   quant_value 201709 201710 201809 201810
## 1      Argentina finalquantity  13724  20635  12559     NA
## 2      Argentina    finalvalue  16354  21591  18054     NA
## 3      Argentina   prequantity     NA     NA  12559  10130
## 4      Argentina      prevalue     NA     NA  18054  12041
## 5      Australia finalquantity  22087  21412  18775     NA
## 6      Australia    finalvalue  17841  15981  17519     NA
## 7      Australia   prequantity     NA     NA  18775  28739
## 8      Australia      prevalue     NA     NA  17519  28244
## 9        Austria finalquantity  25836  17302  25621     NA
## 10       Austria    finalvalue  41803  30929  35954     NA

DATA ANALYSIS

Questions：

1. Which country/area has the overall largest final quantity / value

final_quantity = steel_final_2%>%
  filter(quant_value =='finalquantity')%>%
  rowwise()%>%
  mutate(finalquantity_overall= sum(`201709`,`201710`,`201809`,`201810`,na.rm=TRUE))%>%
  arrange(desc(finalquantity_overall))
head(final_quantity)

## # A tibble: 6 x 7
##   Country_areas quant_value `201709` `201710` `201809` `201810`
##   <chr>         <chr>          <dbl>    <dbl>    <dbl>    <dbl>
## 1 Europe        finalquant…   804507   820708   675161       NA
## 2 North America finalquant…   729999   743537   638543       NA
## 3 Pacific Rim … finalquant…   691501   662335   418794       NA
## 4 Canada        finalquant…   435284   482878   380484       NA
## 5 European Uni… finalquant…   455216   435696   392240       NA
## 6 Euro Area     finalquant…   396011   358825   292800       NA
## # ... with 1 more variable: finalquantity_overall <dbl>

final_value = steel_final_2%>%
  filter(quant_value =='finalvalue')%>%
  rowwise()%>%
  mutate(finalvalue_overall= sum(`201709`,`201710`,`201809`,`201810`,na.rm=TRUE))%>%
  arrange(desc(finalvalue_overall))
head(final_value)

## # A tibble: 6 x 7
##   Country_areas quant_value `201709` `201710` `201809` `201810`
##   <chr>         <chr>          <dbl>    <dbl>    <dbl>    <dbl>
## 1 Europe        finalvalue    750333   794907   682885       NA
## 2 North America finalvalue    639544   665119   613821       NA
## 3 Pacific Rim … finalvalue    667787   657377   473571       NA
## 4 European Uni… finalvalue    536615   563048   488745       NA
## 5 Euro Area     finalvalue    447539   450147   364049       NA
## 6 Canada        finalvalue    414721   454037   380790       NA
## # ... with 1 more variable: finalvalue_overall <dbl>

Europe has the highest overall quantity.

final_quantity$Country_areas <- factor(final_quantity$Country_areas, levels = final_quantity$Country_areas[order(final_quantity$finalquantity_overall)])

 g=ggplot(final_quantity,aes(y=finalquantity_overall,x=Country_areas,label=Country_areas))
 g+geom_bar(stat='identity', aes(fill=finalquantity_overall), width=.5)+
   coord_flip()

Europe has the highest overall value.

final_value$Country_areas <- factor(final_value$Country_areas, levels = final_value$Country_areas[order(final_value$finalvalue_overall)])

 g=ggplot(final_value,aes(y=finalvalue_overall,x=Country_areas,label=Country_areas))
 g+geom_bar(stat='identity', aes(fill=finalvalue_overall), width=.5)+
   coord_flip()

2. Which country has the highest unit value (Value / Quantity)

Brazil has the hightest unit price while Singapore has the lowest unit price.

unit_value=data.frame(final_value$Country_areas,final_quantity$finalquantity_overall,final_value$finalvalue_overall)
names(unit_value) = c('country','quantity','value')
unit_value=unit_value%>%
  mutate(unit_value = quantity /value )
  
unit_value$country <- factor(unit_value$country, levels = unit_value$country[order(unit_value$unit_value)])

g1=ggplot(unit_value,aes(y=unit_value,x=country,label=country))
 g1+geom_bar(stat='identity', aes(fill=unit_value), width=.5)+
   coord_flip()

## Warning: Removed 1 rows containing missing values (position_stack).

#####3. What does unit value distribution looks like?
The unit value distribution is a normal distribution with slightly left skewed.
The average unit value is 0.9373.

ggplot(unit_value, aes(x=unit_value)) + 
  geom_histogram(binwidth = 0.1, color="darkblue", fill="lightblue")+
  geom_density(alpha=.2, fill="#FF6666")

## Warning: Removed 1 rows containing non-finite values (stat_bin).

## Warning: Removed 1 rows containing non-finite values (stat_density).

summary(unit_value)

##            country      quantity           value           unit_value    
##  Singapore     : 1   Min.   :      0   Min.   :      0   Min.   :0.2360  
##  Colombia      : 1   1st Qu.:  27938   1st Qu.:  31471   1st Qu.:0.8046  
##  Hong Kong     : 1   Median :  86679   Median : 104412   Median :0.9368  
##  Norway        : 1   Mean   : 377578   Mean   : 361586   Mean   :0.9373  
##  Poland        : 1   3rd Qu.: 515234   3rd Qu.: 407678   3rd Qu.:1.0529  
##  United Kingdom: 1   Max.   :2300376   Max.   :2228125   Max.   :1.4819  
##  (Other)       :41                                       NA's   :1

What is the defferent between preliminary and final quantity

Most quantity difference falls in 0-100 bin. This is right skewed distribution.

diff  = steel_final_1%>%
  mutate(quantity_diff = finalquantity - prequantity,
         value_diff = finalvalue - prevalue)%>%
  filter(!is.na(quantity_diff))

ggplot(diff, aes(x=quantity_diff)) +
  geom_histogram(binwidth = 100, color="darkblue", fill="lightblue")+
  geom_density(alpha=.2, fill="#FF6666")

Data607_project2_weizhou

Wei Zhou

3/10/2019