Loading necessary libraries.

library(knitr)
library(stringr)
library(ggplot2)
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Tidying Three Datasets

The goal of this assignment is to choose three “wide” datasets from the Week 5 Discussion Board and to tidy and transform them. The datasets used below are the ChronicDiseases, ChildMortality, and Shipping datasets. For each dataset, a question is posed and an answer is attempted.

Chronic Diseases Dataset

What is the most prevalent disease in each state?

First, we read in the Chronic Diseases dataset.

## Chronic Diseases Dataset
CD = read.csv("ChronicDisease.csv", stringsAsFactors = T)
dim(CD)
## [1] 237961     34
head(CD)
##     X0 YearEnd LocationAbbr         LocationDesc DataSource   Topic
## 1 2013    2013           CA           California      YRBSS Alcohol
## 2 2013    2013           CO             Colorado      YRBSS Alcohol
## 3 2013    2013           CT          Connecticut      YRBSS Alcohol
## 4 2013    2013           DC District of Columbia      YRBSS Alcohol
## 5 2013    2013           DE             Delaware      YRBSS Alcohol
## 6 2013    2013           FL              Florida      YRBSS Alcohol
##                  Question Response DataValueUnit DataValueTypeID
## 1 Alcohol use among youth       NA             %         CrdPrev
## 2 Alcohol use among youth       NA             %         CrdPrev
## 3 Alcohol use among youth       NA             %         CrdPrev
## 4 Alcohol use among youth       NA             %         CrdPrev
## 5 Alcohol use among youth       NA             %         CrdPrev
## 6 Alcohol use among youth       NA             %         CrdPrev
##      DataValueType DataValue DataValueAlt DataValueFootnoteSymbol
## 1 Crude Prevalence                     NA                       -
## 2 Crude Prevalence                     NA                       -
## 3 Crude Prevalence      36.7         36.7                        
## 4 Crude Prevalence      31.4         31.4                        
## 5 Crude Prevalence      36.3         36.3                        
## 6 Crude Prevalence      34.8         34.8                        
##   DatavalueFootnote LowConfidenceLimit HighConfidenceLimit
## 1 No data available                 NA                  NA
## 2 No data available                 NA                  NA
## 3                                 32.7                41.0
## 4                                 30.2                32.5
## 5                                 33.7                39.0
## 6                                 33.1                36.6
##   StratificationCategory1 Stratification1 StratificationCategory2
## 1                 Overall         Overall                      NA
## 2                 Overall         Overall                      NA
## 3                 Overall         Overall                      NA
## 4                 Overall         Overall                      NA
## 5                 Overall         Overall                      NA
## 6                 Overall         Overall                      NA
##   Stratification2 StratificationCategory3 Stratification3
## 1              NA                      NA              NA
## 2              NA                      NA              NA
## 3              NA                      NA              NA
## 4              NA                      NA              NA
## 5              NA                      NA              NA
## 6              NA                      NA              NA
##                                 GeoLocation TopicID QuestionID ResponseID
## 1  (37.63864012300047, -120.99999953799971)     ALC     ALC1_1         NA
## 2 (38.843840757000464, -106.13361092099967)     ALC     ALC1_1         NA
## 3   (41.56266102000046, -72.64984095199964)     ALC     ALC1_1         NA
## 4                   (38.907192, -77.036871)     ALC     ALC1_1         NA
## 5  (39.008830667000495, -75.57774116799965)     ALC     ALC1_1         NA
## 6  (28.932040377000476, -81.92896053899966)     ALC     ALC1_1         NA
##   LocationID StratificationCategoryID1 StratificationID1
## 1          6                   OVERALL               OVR
## 2          8                   OVERALL               OVR
## 3          9                   OVERALL               OVR
## 4         11                   OVERALL               OVR
## 5         10                   OVERALL               OVR
## 6         12                   OVERALL               OVR
##   StratificationCategoryID2 StratificationID2 StratificationCategoryID3
## 1                        NA                NA                        NA
## 2                        NA                NA                        NA
## 3                        NA                NA                        NA
## 4                        NA                NA                        NA
## 5                        NA                NA                        NA
## 6                        NA                NA                        NA
##   StratificationID3
## 1                NA
## 2                NA
## 3                NA
## 4                NA
## 5                NA
## 6                NA

After viewing the dataset and determining which columns would be most helpful for answering the question, the unneeded columns were removed and the remaining ones renamed for clarity.

# Remove columns we won't be using
CD = CD[,c(1:6,10,13,18, 19)]
# Change column names

colnames(CD) = c( "Year0", "YearEnd", "StateAbbr", "State", "DataSource", "Topic", "DataType", "DataValue", "StratCategory", "Stratification")
head(CD)
##   Year0 YearEnd StateAbbr                State DataSource   Topic DataType
## 1  2013    2013        CA           California      YRBSS Alcohol  CrdPrev
## 2  2013    2013        CO             Colorado      YRBSS Alcohol  CrdPrev
## 3  2013    2013        CT          Connecticut      YRBSS Alcohol  CrdPrev
## 4  2013    2013        DC District of Columbia      YRBSS Alcohol  CrdPrev
## 5  2013    2013        DE             Delaware      YRBSS Alcohol  CrdPrev
## 6  2013    2013        FL              Florida      YRBSS Alcohol  CrdPrev
##   DataValue StratCategory Stratification
## 1        NA       Overall        Overall
## 2        NA       Overall        Overall
## 3      36.7       Overall        Overall
## 4      31.4       Overall        Overall
## 5      36.3       Overall        Overall
## 6      34.8       Overall        Overall
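
For reference, the same subsetting and renaming can be written with dplyr verbs, selecting columns by name rather than by position. A sketch, assuming the column order shown in the output above (note that column 13 is DataValueAlt, the numeric copy of DataValue):

# Equivalent column selection and renaming with dplyr verbs
CD <- CD %>%
  select(X0, YearEnd, LocationAbbr, LocationDesc, DataSource, Topic,
         DataValueTypeID, DataValueAlt,
         StratificationCategory1, Stratification1) %>%
  rename(Year0 = X0, StateAbbr = LocationAbbr, State = LocationDesc,
         DataType = DataValueTypeID, DataValue = DataValueAlt,
         StratCategory = StratificationCategory1,
         Stratification = Stratification1)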

Some of the values in the Topic column are not uniformly spelled. We standardize those values and then remove NA values in the DataValue column for easier computation.

# Correct errors within data
CD$Topic = str_replace_all(CD$Topic, "Cardiovascular disease", "Cardiovascular Disease")
CD$Topic = str_replace_all(CD$Topic, "Chronic kidney disease", "Chronic Kidney Disease")

# Removing NA values
newCD = filter(CD, !is.na(DataValue))
head(newCD)
##   Year0 YearEnd StateAbbr                State DataSource   Topic DataType
## 1  2013    2013        CT          Connecticut      YRBSS Alcohol  CrdPrev
## 2  2013    2013        DC District of Columbia      YRBSS Alcohol  CrdPrev
## 3  2013    2013        DE             Delaware      YRBSS Alcohol  CrdPrev
## 4  2013    2013        FL              Florida      YRBSS Alcohol  CrdPrev
## 5  2013    2013        GA              Georgia      YRBSS Alcohol  CrdPrev
## 6  2013    2013        GU                 Guam      YRBSS Alcohol  CrdPrev
##   DataValue StratCategory Stratification
## 1      36.7       Overall        Overall
## 2      31.4       Overall        Overall
## 3      36.3       Overall        Overall
## 4      34.8       Overall        Overall
## 5      27.9       Overall        Overall
## 6      23.5       Overall        Overall
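
The spelling fix could equivalently be done with dplyr::recode(), which only rewrites exact matches rather than substrings. A sketch of the alternative:

# Alternative: fix the inconsistent spellings with dplyr::recode()
CD$Topic <- recode(CD$Topic,
                   "Cardiovascular disease" = "Cardiovascular Disease",
                   "Chronic kidney disease" = "Chronic Kidney Disease")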

From here, we tidy the data by first grouping by row_number() so that each row has a unique identifier, which bypasses errors from duplicated rows. We then spread the StratCategory column so that Gender, Overall, and Race/Ethnicity each become a column.

# Break Topic into several columns
newCD =  group_by(newCD, row_number())
newCD = spread(newCD, StratCategory, Stratification)
kable(head(newCD))

|Year0|YearEnd|StateAbbr|State|DataSource|Topic|DataType|DataValue|row_number()|Gender|Overall|Race/Ethnicity|
|---|---|---|---|---|---|---|---|---|---|---|---|
|2013|2013|CT|Connecticut|YRBSS|Alcohol|CrdPrev|36.7|1|NA|Overall|NA|
|2013|2013|DC|District of Columbia|YRBSS|Alcohol|CrdPrev|31.4|2|NA|Overall|NA|
|2013|2013|DE|Delaware|YRBSS|Alcohol|CrdPrev|36.3|3|NA|Overall|NA|
|2013|2013|FL|Florida|YRBSS|Alcohol|CrdPrev|34.8|4|NA|Overall|NA|
|2013|2013|GA|Georgia|YRBSS|Alcohol|CrdPrev|27.9|5|NA|Overall|NA|
|2013|2013|GU|Guam|YRBSS|Alcohol|CrdPrev|23.5|6|NA|Overall|NA|
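
With tidyr 1.0 or later, the same reshape can be written with pivot_wider(), which takes an explicit id column in place of the row_number() grouping. A minimal sketch, assuming it is run on the filtered data instead of the two steps above:

# pivot_wider() equivalent of the row_number()/spread() combination
newCD <- newCD %>%
  mutate(row_id = row_number()) %>%          # unique id per row, avoids duplicate-row errors
  pivot_wider(names_from = StratCategory,    # values here become the new column names
              values_from = Stratification)  # values here fill the new columns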

To find the most prevalent illness in each state, we first group the data by State and illness (Topic), summarize the mean DataValue, and then extract, for each state, the Topic with the highest mean.

# Finding the mean number of incidents for each Topic by State
stats = data.frame(newCD %>% group_by(State, Topic) %>%
          summarise(MeanState = round(mean(DataValue))))

# Finding the most prevalent issue in each state
kable(stats %>% group_by(State) %>% top_n(1, MeanState))

|State|Topic|MeanState|
|---|---|---|
|Alabama|Cardiovascular Disease|1127|
|Alaska|Overarching Conditions|192|
|Arizona|Chronic Obstructive Pulmonary Disease|1761|
|Arkansas|Chronic Obstructive Pulmonary Disease|1083|
|California|Cardiovascular Disease|4708|
|Colorado|Diabetes|903|
|Connecticut|Cardiovascular Disease|657|
|Delaware|Cardiovascular Disease|258|
|District of Columbia|Overarching Conditions|218|
|Florida|Chronic Obstructive Pulmonary Disease|6728|
|Georgia|Cardiovascular Disease|1323|
|Guam|Cancer|60|
|Hawaii|Cardiovascular Disease|285|
|Idaho|Cardiovascular Disease|337|
|Illinois|Cardiovascular Disease|2079|
|Indiana|Cardiovascular Disease|1278|
|Iowa|Chronic Obstructive Pulmonary Disease|1415|
|Kansas|Cardiovascular Disease|512|
|Kentucky|Chronic Obstructive Pulmonary Disease|3144|
|Louisiana|Cardiovascular Disease|897|
|Maine|Cardiovascular Disease|429|
|Maryland|Chronic Obstructive Pulmonary Disease|1208|
|Massachusetts|Chronic Obstructive Pulmonary Disease|1301|
|Michigan|Diabetes|3082|
|Minnesota|Cardiovascular Disease|684|
|Mississippi|Cardiovascular Disease|801|
|Missouri|Cardiovascular Disease|1207|
|Montana|Cardiovascular Disease|286|
|Nebraska|Chronic Obstructive Pulmonary Disease|631|
|Nevada|Chronic Obstructive Pulmonary Disease|825|
|New Hampshire|Cardiovascular Disease|378|
|New Jersey|Chronic Obstructive Pulmonary Disease|2397|
|New Mexico|Diabetes|447|
|New York|Diabetes|5323|
|North Carolina|Chronic Obstructive Pulmonary Disease|3396|
|North Dakota|Cardiovascular Disease|251|
|Ohio|Cardiovascular Disease|2213|
|Oklahoma|Cardiovascular Disease|806|
|Oregon|Diabetes|908|
|Pennsylvania|Cardiovascular Disease|2583|
|Puerto Rico|Reproductive Health|72|
|Rhode Island|Chronic Obstructive Pulmonary Disease|391|
|South Carolina|Chronic Obstructive Pulmonary Disease|2089|
|South Dakota|Diabetes|319|
|Tennessee|Cardiovascular Disease|1388|
|Texas|Cardiovascular Disease|3026|
|United States|Chronic Obstructive Pulmonary Disease|72820|
|Utah|Cardiovascular Disease|341|
|Vermont|Chronic Obstructive Pulmonary Disease|322|
|Virgin Islands|Older Adults|49|
|Virginia|Cardiovascular Disease|1146|
|Washington|Diabetes|1513|
|West Virginia|Chronic Obstructive Pulmonary Disease|1119|
|Wisconsin|Chronic Obstructive Pulmonary Disease|1807|
|Wyoming|Overarching Conditions|208|
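
As of dplyr 1.0, slice_max() supersedes top_n() and makes tie handling explicit. A sketch of the equivalent call, under that version assumption:

# slice_max() equivalent of top_n(1, MeanState)
stats %>%
  group_by(State) %>%
  slice_max(MeanState, n = 1, with_ties = FALSE) %>%  # keep exactly one Topic per State
  ungroup()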

Child Mortality Dataset

Has mortality for children aged five and under increased or decreased over the past 60 years?

Read in the Child Mortality csv file and remove the columns that do not pertain strictly to children under the age of five.

## Child Mortality Dataset
CM = read.csv("ChildMortality.csv", skip = 6)
CM = CM[,1:69]
head(CM)
##   ISO.Code CountryName Uncertainty.bounds. U5MR.1950 U5MR.1951 U5MR.1952
## 1      AFG Afghanistan               Lower        NA        NA        NA
## 2      AFG Afghanistan              Median        NA        NA        NA
## 3      AFG Afghanistan               Upper        NA        NA        NA
## 4      AGO      Angola               Lower        NA        NA        NA
## 5      AGO      Angola              Median        NA        NA        NA
## 6      AGO      Angola               Upper        NA        NA        NA
##   U5MR.1953 U5MR.1954 U5MR.1955 U5MR.1956 U5MR.1957 U5MR.1958 U5MR.1959
## 1        NA        NA        NA        NA        NA        NA        NA
## 2        NA        NA        NA        NA        NA        NA        NA
## 3        NA        NA        NA        NA        NA        NA        NA
## 4        NA        NA        NA        NA        NA        NA        NA
## 5        NA        NA        NA        NA        NA        NA        NA
## 6        NA        NA        NA        NA        NA        NA        NA
##   U5MR.1960 U5MR.1961 U5MR.1962 U5MR.1963 U5MR.1964 U5MR.1965 U5MR.1966
## 1        NA     306.9     305.5     303.1     300.0     295.3     290.6
## 2        NA     356.5     350.6     345.0     339.7     334.1     328.7
## 3        NA     413.2     402.7     393.5     385.5     378.1     371.7
## 4        NA        NA        NA        NA        NA        NA        NA
## 5        NA        NA        NA        NA        NA        NA        NA
## 6        NA        NA        NA        NA        NA        NA        NA
##   U5MR.1967 U5MR.1968 U5MR.1969 U5MR.1970 U5MR.1971 U5MR.1972 U5MR.1973
## 1     285.7     280.7     275.9     270.9     265.8     260.9     256.1
## 2     323.3     318.1     313.0     307.8     302.1     296.4     290.8
## 3     366.0     360.8     355.8     351.4     346.0     340.4     334.4
## 4        NA        NA        NA        NA        NA        NA        NA
## 5        NA        NA        NA        NA        NA        NA        NA
## 6        NA        NA        NA        NA        NA        NA        NA
##   U5MR.1974 U5MR.1975 U5MR.1976 U5MR.1977 U5MR.1978 U5MR.1979 U5MR.1980
## 1     251.6     246.6     242.0     237.2     231.7     226.6     221.3
## 2     284.9     279.4     273.6     267.8     261.6     255.5     249.1
## 3     328.1     321.2     313.9     306.3     298.8     290.8     283.8
## 4        NA        NA        NA        NA        NA        NA     187.4
## 5        NA        NA        NA        NA        NA        NA     234.1
## 6        NA        NA        NA        NA        NA        NA     290.2
##   U5MR.1981 U5MR.1982 U5MR.1983 U5MR.1984 U5MR.1985 U5MR.1986 U5MR.1987
## 1     215.3     209.8     204.1     198.5     192.7     186.8     181.0
## 2     242.7     236.2     229.7     222.9     216.0     209.2     202.1
## 3     276.7     269.6     262.1     254.3     246.4     237.7     229.1
## 4     190.0     192.7     194.3     196.0     197.4     198.4     199.8
## 5     232.8     231.5     230.2     229.1     228.3     227.5     226.9
## 6     282.9     276.8     271.5     267.4     264.0     260.9     258.6
##   U5MR.1988 U5MR.1989 U5MR.1990 U5MR.1991 U5MR.1992 U5MR.1993 U5MR.1994
## 1     175.0     169.2     163.1     157.2     151.7     146.5     142.0
## 2     195.0     187.8     181.0     174.2     167.8     162.0     156.8
## 3     220.3     210.7     201.7     192.8     185.3     178.5     172.4
## 4     200.6     201.1     201.6     201.9     202.3     202.1     201.9
## 5     226.5     226.2     226.0     225.9     226.0     225.8     225.5
## 6     257.3     256.1     255.3     254.4     253.6     253.1     252.8
##   U5MR.1995 U5MR.1996 U5MR.1997 U5MR.1998 U5MR.1999 U5MR.2000 U5MR.2001
## 1     138.3     135.6     133.1     130.7     128.3     125.6     122.6
## 2     152.3     148.6     145.5     142.6     139.9     137.0     133.8
## 3     167.2     162.7     159.0     155.9     153.1     150.1     146.8
## 4     201.1     199.9     198.2     195.9     193.3     190.4     186.9
## 5     224.8     224.0     222.6     220.8     218.9     216.7     214.1
## 6     252.5     251.7     250.4     249.2     247.9     246.6     245.4
##   U5MR.2002 U5MR.2003 U5MR.2004 U5MR.2005 U5MR.2006 U5MR.2007 U5MR.2008
## 1     119.4     115.9     112.5     109.0     105.6     102.2      98.9
## 2     130.3     126.8     123.2     119.6     116.3     113.2     110.4
## 3     143.1     139.6     136.1     132.6     129.6     126.7     124.2
## 4     182.5     178.3     172.7     166.4     159.9     153.2     145.7
## 5     211.7     209.2     206.7     203.9     200.5     196.4     192.0
## 6     244.7     244.8     245.1     245.4     245.2     245.6     245.8
##   U5MR.2009 U5MR.2010 U5MR.2011 U5MR.2012 U5MR.2013 U5MR.2014 U5MR.2015
## 1      95.4      92.0      88.1      83.8      79.1      74.5      69.6
## 2     107.6     105.0     102.3      99.5      96.7      93.9      91.1
## 3     122.2     120.6     119.3     118.5     118.2     118.3     118.8
## 4     137.8     130.1     122.1     114.9     107.7     101.1      94.8
## 5     187.3     182.5     177.3     172.2     167.1     162.2     156.9
## 6     246.8     248.3     249.5     250.8     252.2     253.6     253.7

Because each year is a separate column, we first gather all the years into one column. Then we spread the Uncertainty.bounds. column into three (Lower, Median, Upper). Finally, we remove the NA values for easier computation. It should be noted that NA values appear consistently across the three bounds: if the Lower column is NA for a row, so are the Median and Upper columns, and vice versa. Also, because the U5MR_Year values were previously column names, they contain more than just the year, so we strip the “U5MR.” prefix that precedes it.

# Creating a column with all the data of Mortality Rates of children under the age of 5
newCM = gather(CM, "U5MR_Year", "U5MR_Rate", 4:69)
kable(head(newCM))

|ISO.Code|CountryName|Uncertainty.bounds.|U5MR_Year|U5MR_Rate|
|---|---|---|---|---|
|AFG|Afghanistan|Lower|U5MR.1950|NA|
|AFG|Afghanistan|Median|U5MR.1950|NA|
|AFG|Afghanistan|Upper|U5MR.1950|NA|
|AGO|Angola|Lower|U5MR.1950|NA|
|AGO|Angola|Median|U5MR.1950|NA|
|AGO|Angola|Upper|U5MR.1950|NA|
# Breaking the Uncertainty.bounds column into Lower, Middle and Upper
newCM = spread(newCM, "Uncertainty.bounds.", "U5MR_Rate")
kable(head(newCM))

|ISO.Code|CountryName|U5MR_Year|Lower|Median|Upper|
|---|---|---|---|---|---|
|AFG|Afghanistan|U5MR.1950|NA|NA|NA|
|AFG|Afghanistan|U5MR.1951|NA|NA|NA|
|AFG|Afghanistan|U5MR.1952|NA|NA|NA|
|AFG|Afghanistan|U5MR.1953|NA|NA|NA|
|AFG|Afghanistan|U5MR.1954|NA|NA|NA|
|AFG|Afghanistan|U5MR.1955|NA|NA|NA|
# Removing NA values
newCM = filter(newCM, !is.na(Lower))
kable(head(newCM))

|ISO.Code|CountryName|U5MR_Year|Lower|Median|Upper|
|---|---|---|---|---|---|
|AFG|Afghanistan|U5MR.1961|306.9|356.5|413.2|
|AFG|Afghanistan|U5MR.1962|305.5|350.6|402.7|
|AFG|Afghanistan|U5MR.1963|303.1|345.0|393.5|
|AFG|Afghanistan|U5MR.1964|300.0|339.7|385.5|
|AFG|Afghanistan|U5MR.1965|295.3|334.1|378.1|
|AFG|Afghanistan|U5MR.1966|290.6|328.7|371.7|
# Changing the Year data into only showing the year
newCM$U5MR_Year = as.numeric(substr(newCM$U5MR_Year, 6, 10))
kable(head(newCM))

|ISO.Code|CountryName|U5MR_Year|Lower|Median|Upper|
|---|---|---|---|---|---|
|AFG|Afghanistan|1961|306.9|356.5|413.2|
|AFG|Afghanistan|1962|305.5|350.6|402.7|
|AFG|Afghanistan|1963|303.1|345.0|393.5|
|AFG|Afghanistan|1964|300.0|339.7|385.5|
|AFG|Afghanistan|1965|295.3|334.1|378.1|
|AFG|Afghanistan|1966|290.6|328.7|371.7|
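
With tidyr 1.0 or later, the whole sequence above (gather, spread, NA removal, and prefix stripping) can be expressed as one pipeline. A sketch under that version assumption:

# One-pipeline equivalent using pivot_longer()/pivot_wider()
newCM <- CM %>%
  pivot_longer(starts_with("U5MR"),
               names_to = "U5MR_Year",
               names_prefix = "U5MR.",        # strip the prefix during the reshape
               values_to = "U5MR_Rate") %>%
  pivot_wider(names_from = Uncertainty.bounds.,
              values_from = U5MR_Rate) %>%
  filter(!is.na(Lower)) %>%                   # NA rows are NA across all three bounds
  mutate(U5MR_Year = as.numeric(U5MR_Year))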

Because the data are too large to view in table format, most conclusions about the increase or decrease of child mortality over the years are drawn from plots. The data are grouped first by country and then by year. The plot by year shows a steep decline in overall child mortality since the 1960s.

# Grouped by Country
newCM %>% group_by(ISO.Code) %>% 
          summarise(MeanLower = round(mean(Lower),2),
                    MeanMedian = round(mean(Median),2),
                    MeanUpper = round(mean(Upper),2),
                    OverallMean= round((MeanLower+ MeanMedian+ MeanUpper)/3,2),
                    Counts = n())
## # A tibble: 195 × 6
##    ISO.Code MeanLower MeanMedian MeanUpper OverallMean Counts
##      <fctr>     <dbl>      <dbl>     <dbl>       <dbl>  <int>
## 1       AFG    183.77     207.22    235.32      208.77     55
## 2       AGO    175.01     210.84    254.72      213.52     36
## 3       ALB     29.62      36.82     48.55       38.33     38
## 4       AND      3.33       4.68      6.69        4.90     26
## 5       ARE     41.70      48.97     57.74       49.47     56
## 6       ARG     31.82      32.81     33.83       32.82     47
## 7       ARM     38.33      43.24     48.74       43.44     40
## 8       ATG     12.43      14.86     17.93       15.07     26
## 9       AUS     14.27      14.56     14.86       14.56     66
## 10      AUT     18.25      18.73     19.23       18.74     62
## # ... with 185 more rows
# Grouped by Year
newCM %>% group_by(U5MR_Year) %>% 
  summarise(MeanLower = round(mean(Lower),2),
            MeanMedian = round(mean(Median),2),
            MeanUpper = round(mean(Upper),2),
            OverallMean= round((MeanLower+ MeanMedian+ MeanUpper)/3,2),
            Counts = n())%>% 
ggplot(aes(x= U5MR_Year, y=OverallMean , colour = OverallMean)) + 
  geom_bar( aes(fill= OverallMean), stat="identity", position=position_dodge())
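
Since each year maps to a single bar, geom_col() (shorthand for geom_bar(stat = "identity")) produces the same plot without the position adjustment. A minimal sketch of the equivalent:

# geom_col() equivalent of geom_bar(stat = "identity")
newCM %>%
  group_by(U5MR_Year) %>%
  summarise(OverallMean = round(mean((Lower + Median + Upper) / 3), 2)) %>%
  ggplot(aes(x = U5MR_Year, y = OverallMean, fill = OverallMean)) +
  geom_col()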

Shipping Dataset

Which country is most cost-effective with its shipping fees (i.e., collects more in fees than the carrier price)?

Read the Shipping Countries Dataset into R.

## Shipping Countries Dataset
SC = read.csv("ShippingCountries.csv")
head(SC)
##         Date                  Values  China Canada Tawian Singapore US
## 1 12/29/2016        Price of Carrier    518     NA     NA        NA NA
## 2            Shipping Fees Collected    260     NA     NA        NA NA
## 3 12/30/2016        Price of Carrier    232     NA     NA        NA NA
## 4            Shipping Fees Collected    132     NA     NA        NA NA
## 5   1/3/2017        Price of Carrier 1143.5     NA     NA        NA NA
## 6            Shipping Fees Collected    752     NA     NA        NA NA
##   England Korea Hong.Kong
## 1      NA    NA        NA
## 2      NA    NA        NA
## 3      NA    NA        NA
## 4      NA    NA        NA
## 5      NA    NA        NA
## 6      NA    NA        NA

In attempting to split the Values column into two columns, we encounter the issue that every second Date cell is empty. To handle this, we write a for loop that, whenever a Date cell is empty, copies the date down from the previous row.

# For loop to fill in empty Date cells
for (i in seq_along(SC$Date)) {
  if (SC$Date[i] == "") {
    # Copy the date down from the previous row
    SC$Date[i] = SC$Date[i - 1]
  }
}
kable(head(SC))

|Date|Values|China|Canada|Tawian|Singapore|US|England|Korea|Hong.Kong|
|---|---|---|---|---|---|---|---|---|---|
|12/29/2016|Price of Carrier|518|NA|NA|NA|NA|NA|NA|NA|
|12/29/2016|Shipping Fees Collected|260|NA|NA|NA|NA|NA|NA|NA|
|12/30/2016|Price of Carrier|232|NA|NA|NA|NA|NA|NA|NA|
|12/30/2016|Shipping Fees Collected|132|NA|NA|NA|NA|NA|NA|NA|
|1/3/2017|Price of Carrier|1143.5|NA|NA|NA|NA|NA|NA|NA|
|1/3/2017|Shipping Fees Collected|752|NA|NA|NA|NA|NA|NA|NA|
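
An idiomatic alternative to the loop is tidyr::fill(), which carries the last non-missing value downward. A sketch, assuming the blank strings are first converted to NA so fill() can see the gaps:

# tidyr::fill() equivalent of the date-filling loop
SC <- SC %>%
  mutate(Date = na_if(as.character(Date), "")) %>%  # turn "" into NA
  fill(Date, .direction = "down")                   # copy each date into the blank row below it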

A Country column is created by gathering the country columns into one. Then, because some of the fee values are either blank or incorrectly entered characters, we coerce the column to numeric, which replaces any non-numeric entries (and the blanks) with NA. From there, we split the Values column into two columns (Price of Carrier and Shipping Fees Collected) and break the Date column into Month, Day, and Year columns. Some spelling errors are also corrected, such as renaming “Tawian” to “Taiwan” and replacing the “.” in “Hong.Kong” with a space. Finally, a new column called Difference is created, which measures the difference between the Shipping Fees Collected and the Price of Carrier.

# Making a Country column
newSC = SC %>% gather("Country", "Fee", 3:10)
## Warning: attributes are not identical across measure variables; they will
## be dropped
kable(head(newSC))

|Date|Values|Country|Fee|
|---|---|---|---|
|12/29/2016|Price of Carrier|China|518|
|12/29/2016|Shipping Fees Collected|China|260|
|12/30/2016|Price of Carrier|China|232|
|12/30/2016|Shipping Fees Collected|China|132|
|1/3/2017|Price of Carrier|China|1143.5|
|1/3/2017|Shipping Fees Collected|China|752|
# Removing any non-numerical characters and replacing blank entries with NA
newSC$Fee = as.numeric(as.character(newSC$Fee))
## Warning: NAs introduced by coercion
kable(head(newSC))

|Date|Values|Country|Fee|
|---|---|---|---|
|12/29/2016|Price of Carrier|China|518.0|
|12/29/2016|Shipping Fees Collected|China|260.0|
|12/30/2016|Price of Carrier|China|232.0|
|12/30/2016|Shipping Fees Collected|China|132.0|
|1/3/2017|Price of Carrier|China|1143.5|
|1/3/2017|Shipping Fees Collected|China|752.0|
# Separating the Values column into two columns
newSC = spread(newSC, "Values", "Fee")
kable(head(newSC))

|Date|Country|Price of Carrier|Shipping Fees Collected|
|---|---|---|---|
|1/10/2017|Canada|279.5599|336|
|1/10/2017|China|318.0000|108|
|1/10/2017|England|NA|NA|
|1/10/2017|Hong.Kong|13.0000|20|
|1/10/2017|Korea|NA|NA|
|1/10/2017|Singapore|NA|NA|
# Breaking Date column in Day, Month and Year
newSC = separate(newSC, Date, c("Month", "Day", "Year"), sep = "/")
kable(head(newSC))

|Month|Day|Year|Country|Price of Carrier|Shipping Fees Collected|
|---|---|---|---|---|---|
|1|10|2017|Canada|279.5599|336|
|1|10|2017|China|318.0000|108|
|1|10|2017|England|NA|NA|
|1|10|2017|Hong.Kong|13.0000|20|
|1|10|2017|Korea|NA|NA|
|1|10|2017|Singapore|NA|NA|
# Removing the "." in Hong.Kong and correcting Tawian to Taiwan
newSC$Country = str_replace_all(newSC$Country, "[.]", " ")
newSC$Country = str_replace_all(newSC$Country, "Tawian", "Taiwan")
kable(head(newSC))

|Month|Day|Year|Country|Price of Carrier|Shipping Fees Collected|
|---|---|---|---|---|---|
|1|10|2017|Canada|279.5599|336|
|1|10|2017|China|318.0000|108|
|1|10|2017|England|NA|NA|
|1|10|2017|Hong Kong|13.0000|20|
|1|10|2017|Korea|NA|NA|
|1|10|2017|Singapore|NA|NA|
# Create difference column for Shipping Fees-Price
newSC = mutate(newSC, Difference = `Shipping Fees Collected` - `Price of Carrier`)
kable(head(newSC))

|Month|Day|Year|Country|Price of Carrier|Shipping Fees Collected|Difference|
|---|---|---|---|---|---|---|
|1|10|2017|Canada|279.5599|336|56.44006|
|1|10|2017|China|318.0000|108|-210.00000|
|1|10|2017|England|NA|NA|NA|
|1|10|2017|Hong Kong|13.0000|20|7.00000|
|1|10|2017|Korea|NA|NA|NA|
|1|10|2017|Singapore|NA|NA|NA|
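
For reference, the whole shipping tidy-up can be collapsed into one pipeline with the newer tidyr interface (1.1 or later, for values_transform). A sketch under those assumptions, starting from the raw SC with blank Date cells:

# One-pipeline version of the shipping tidy-up
newSC <- SC %>%
  mutate(Date = na_if(as.character(Date), "")) %>%
  fill(Date) %>%                                       # fill blank dates downward
  pivot_longer(China:Hong.Kong, names_to = "Country", values_to = "Fee",
               values_transform = list(Fee = as.character)) %>%
  mutate(Fee = suppressWarnings(as.numeric(Fee))) %>%  # non-numeric entries become NA
  pivot_wider(names_from = Values, values_from = Fee) %>%
  separate(Date, c("Month", "Day", "Year"), sep = "/") %>%
  mutate(Country = str_replace_all(Country, "[.]", " "),
         Country = str_replace_all(Country, "Tawian", "Taiwan"),
         Difference = `Shipping Fees Collected` - `Price of Carrier`)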

By grouping the data by country and plotting the mean carrier prices, mean shipping fees, and mean differences between those two values, we see that Canada has the most cost-effective approach and China has the least. However, it should be noted that this could be due to the lack of data for the other countries, since China had 16 more usable data entries than Canada (visible in the Counts column in the table below).

# Summary statistics
shipstats =newSC %>% group_by(Country) %>% 
           filter(!is.na(Difference)) %>% 
           summarise(MeanPrice = round(mean(`Price of Carrier`),2),
                     MeanShipping = round(mean(`Shipping Fees Collected`),2),
                     MeanDiff = round(mean(Difference),2),
                     PercentCollected = round((MeanShipping/MeanPrice)*100,2),
                     Counts = n())
kable(shipstats)

|Country|MeanPrice|MeanShipping|MeanDiff|PercentCollected|Counts|
|---|---|---|---|---|---|
|Canada|102.03|125.40|23.37|122.91|10|
|China|302.29|174.31|-127.98|57.66|26|
|England|60.75|81.00|20.25|133.33|2|
|Hong Kong|48.50|50.00|1.50|103.09|4|
|Korea|31.50|36.00|4.50|114.29|1|
|Singapore|60.36|48.00|-12.36|79.52|5|
|Taiwan|37.86|35.00|-2.86|92.45|7|
|US|62.25|62.55|0.30|100.48|7|
gather(shipstats, "Means", "n", 2:3 ) %>% 
ggplot(aes(x= Means, y= n, colour = Means)) + 
  geom_bar( aes(fill= Means), stat="identity", position=position_dodge())+
  facet_grid(~ Country)

barplot(shipstats$MeanDiff, names.arg =shipstats$Country,  xlab = "Country", ylab = "MeanDifference")
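
The same comparison can also be drawn with ggplot2, reusing the shipstats summary. A minimal sketch:

# ggplot2 version of the base-R barplot of mean differences
ggplot(shipstats, aes(x = Country, y = MeanDiff, fill = MeanDiff > 0)) +
  geom_col(show.legend = FALSE) +
  labs(x = "Country", y = "Mean Difference (Fees - Price)")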