Part 1: Analysis of Fish Data (35 total points)

 

1.1 Import the data

# Read the fish catch data from the JSE archive. The nine column names are
# supplied through col_names rather than taken from the file.
fish <- read_table2(
  "http://jse.amstat.org/datasets/fishcatch.dat.txt",
  col_names = c(
    "obs", "species", "weight",
    "len1", "len2", "len3",
    "height.pct", "width.pct", "sex"
  )
)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   obs = col_double(),
##   species = col_double(),
##   weight = col_double(),
##   len1 = col_double(),
##   len2 = col_double(),
##   len3 = col_double(),
##   height.pct = col_double(),
##   width.pct = col_double(),
##   sex = col_double()
## )

 

1.2 Change ‘sex’ to be a factor and rename

# Turn the numeric sex code into a labelled factor in a single step:
# level "0" is relabelled "male" and level "1" is relabelled "female".
# NOTE(review): confirm this 0/1 coding against the dataset's documentation.
fish$sex <- fct_recode(
  factor(fish$sex),
  male = "0",
  female = "1"
)

 

1.3 Change ‘species’ to be a factor and rename

# Convert the numeric species code to a factor and relabel each of the seven
# codes with its common English name.
fish$species <- fct_recode(
  factor(fish$species),
  "Common Bream" = "1",
  "Whitefish" = "2",
  "Roach" = "3",
  "Silver Bream" = "4",
  "Smelt" = "5",
  "Pike" = "6",
  "Perch" = "7"
)
# Do not modify the following code:
fish.sub <- filter(fish, !is.na(sex)) # only display rows where sex is not NA
knitr::kable(head(fish.sub), format = "markdown")
obs species weight len1 len2 len3 height.pct width.pct sex
14 Common Bream NA 29.5 32 37.3 37.3 13.6 female
15 Common Bream 600 29.4 32 37.2 40.2 13.9 female
17 Common Bream 700 30.4 33 38.3 38.8 13.8 female
21 Common Bream 575 31.3 34 39.5 38.3 14.1 female
26 Common Bream 725 31.8 35 40.9 40.0 14.8 female
30 Common Bream 1000 33.5 37 42.6 44.5 15.5 male

 

1.4 Determine mean weight for each species

# Mean weight per species, skipping missing weights, ordered from the
# smallest mean to the largest.
mean.wt <- group_by(fish, species)
mean.wt <- dplyr::summarize(mean.wt, Mean = mean(weight, na.rm = TRUE))
mean.wt <- arrange(mean.wt, Mean)
# Do not modify the following code:
knitr::kable(mean.wt, format = "markdown")
species Mean
Smelt 11.17857
Roach 152.05000
Silver Bream 154.81818
Perch 382.23929
Whitefish 531.00000
Common Bream 626.00000
Pike 718.70588

The species with the smallest mean weight is Smelt, with a mean weight of 11.18 g (the dataset records weight in grams).

 

1.5 Plot the mean weights for each species

# Bar chart of mean.wt: one bar per species, bar height equal to the
# Mean column (geom_col() plots the values as given).
ggplot(mean.wt, aes(x = species, y = Mean)) +
  geom_col() +
  labs(title = "Mean Weight per Species", x = "Species", y = "Weight")

 

Part 2: Analysis of Forbes Global 2000 Data (50 total points)

 

2.1 Import the dataset

# Load the 2019 Forbes Global 2000 CSV file into Forbes.
# NOTE(review): this is an absolute path on one user's machine; the script
# will fail elsewhere unless the file sits at the same location. Consider a
# project-relative path.
Forbes <- read_csv("/Users/oliviacochran/Desktop/Grad School/Spring 2021/Strategy Analytics II/R Exercise/2019_Forbes_Global_2000.csv") 
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Rank = col_double(),
##   Company = col_character(),
##   Sector = col_character(),
##   Industry = col_character(),
##   Continent = col_character(),
##   Country = col_character(),
##   Revenue = col_double(),
##   Profits = col_double(),
##   Assets = col_double(),
##   Market_Value = col_double()
## )

 

2.2 Exclude specified records

# Keep only rows that have a Sector and whose Revenue is zero or greater.
# filter() drops a row when any condition is FALSE or NA.
Forbes <- filter(Forbes, !is.na(Sector), Revenue >= 0)

 

2.3 Convert four variables to factors

# Convert the Sector column of Forbes to a factor in place. Building a
# free-standing vector named Sector would leave Forbes$Sector as character.
Forbes$Sector <- factor(Forbes$Sector)
# Keep a standalone factor of the level names, derived from the data rather
# than typed by hand, and print it as a summary of the sectors present.
Sector <- factor(levels(Forbes$Sector))
Sector
##  [1] Consumer Discretionary     Consumer Staples          
##  [3] Energy                     Financials                
##  [5] Health Care                Industrials               
##  [7] Information Technology     Materials                 
##  [9] Telecommunication Services Utilities                 
## 10 Levels: Consumer Discretionary Consumer Staples Energy ... Utilities
# Convert the Industry column of Forbes to a factor in place. Building a
# free-standing vector named Industry would leave Forbes$Industry as character.
Forbes$Industry <- factor(Forbes$Industry)
# Keep a standalone factor of the level names, derived from the data rather
# than typed by hand, and print it as a summary of the industries present.
Industry <- factor(levels(Forbes$Industry))
Industry
##  [1] Advertising                   Aerospace & Defense          
##  [3] Air Courier                   Airline                      
##  [5] Aluminum                      Apparel/Accessories          
##  [7] Apparel/Footwear Retail       Auto & Truck Manufacturers   
##  [9] Auto & Truck Parts            Beverages                    
## [11] Biotechs                      Broadcasting & Cable         
## [13] Business & Personal Service   Business Products & Supplies 
## [15] Casinos & Gaming              Communications Equipment     
## [17] Computer & Electronics Retail Computer Hardware            
## [19] Computer Service              Computer Storage Devices     
## [21] Conglomerates                 Construction Materials       
## [23] Construction Services         Consumer Electronics         
## [25] Consumer Financial Services   Containers & Packaging       
## [27] Department Stores             Discount Stores              
## [29] Diversified Chemicals         Diversified Insurance        
## [31] Diversified Metals & Mining   Diversified Utilities        
## [33] Drug Retail                   Electric Utilities           
## [35] Electrical Equipment          Electronics                  
## [37] Environmental & Waste         Food Processing              
## [39] Food Retail                   Forest Products              
## [41] Furniture & Fixtures          Healthcare Services          
## [43] Heavy Equipment               Home Improvement & Retail    
## [45] Hotels & Motels               Household Appliances         
## [47] Household/Personal Care       Insurance Brokers            
## [49] Internet & Catalog Retail     Investment Services          
## [51] Iron & Steel                  Life & Health Insurance      
## [53] Major Banks                   Managed Health Care          
## [55] Medical Equipment & Supplies  Natural Gas Utilities        
## [57] Oil & Gas Operations          Oil Services & Equipment     
## [59] Other Industrial Equipment    Other Transportation         
## [61] Paper & Paper Products        Pharmaceuticals              
## [63] Printing & Publishing         Property & Casualty Insurance
## [65] Railroads                     Real Estate                  
## [67] Recreational Products         Regional Banks               
## [69] Rental & Leasing              Restaurants                  
## [71] Security Systems              Semiconductors               
## [73] Software & Programming        Specialized Chemicals        
## [75] Specialty Stores              Telecommunication services   
## [77] Thrifts & Mortgage Finance    Tobacco                      
## [79] Trading Companies             Trucking                     
## 80 Levels: Advertising Aerospace & Defense Air Courier Airline ... Trucking
# Convert the Continent column of Forbes to a factor in place. Building a
# free-standing vector named Continent would leave Forbes$Continent as
# character.
Forbes$Continent <- factor(Forbes$Continent)
# Keep a standalone factor of the level names, derived from the data rather
# than typed by hand, and print it as a summary of the continents present.
Continent <- factor(levels(Forbes$Continent))
Continent
## [1] Africa        Asia          Australia     Europe        North America
## [6] South America
## Levels: Africa Asia Australia Europe North America South America
# Convert the Country column of Forbes to a factor in place. Building a
# free-standing vector named Country would leave Forbes$Country as character.
Forbes$Country <- factor(Forbes$Country)
# Keep a standalone factor of the level names, derived from the data rather
# than typed by hand, and print it as a summary of the countries present.
Country <- factor(levels(Forbes$Country))
Country
##  [1] Argentina            Australia            Austria             
##  [4] Bahrain              Belgium              Bermuda             
##  [7] Brazil               Canada               Chile               
## [10] China                Colombia             Czech Republic      
## [13] Denmark              Egypt                Finland             
## [16] France               Germany              Greece              
## [19] Hong Kong            Hungary              India               
## [22] Indonesia            Ireland              Israel              
## [25] Italy                Japan                Jordan              
## [28] Kazakhstan           Kenya                Kuwait              
## [31] Lebanon              Luxembourg           Malaysia            
## [34] Mexico               Monaco               Morocco             
## [37] Netherlands          Nigeria              Norway              
## [40] Oman                 Peru                 Philippines         
## [43] Poland               Portugal             Qatar               
## [46] Russia               Saudi Arabia         Singapore           
## [49] South Africa         South Korea          Spain               
## [52] Sweden               Switzerland          Taiwan              
## [55] Thailand             Turkey               United Arab Emirates
## [58] United Kingdom       United States        Vietnam             
## 60 Levels: Argentina Australia Austria Bahrain Belgium Bermuda ... Vietnam

 

2.4 Create a scatterplot of Market Value by Sales

# Restrict the data to the three continents of interest, then plot market
# value against revenue with a smoothed trend line, one panel per continent.
MVS <- subset(Forbes, Continent %in% c("Asia", "Europe", "North America"))
ggplot(MVS, aes(x = Revenue, y = Market_Value)) +
  geom_point() +
  geom_smooth() +
  facet_grid(. ~ Continent)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Based on the graphs, Asia, Europe, and North America all show a positive relationship between revenue and market value until revenue reaches about 250. Beyond that point, the relationship in Asia turns negative (as revenue increases, market value decreases), while in Europe it stays positive. For North America, the relationship between revenue and market value stays positive until revenue reaches about 350 and then declines sharply. North America has the steepest slope of the three, which is influenced by the outliers in the dataset. Compared to Asia and Europe, North America has not only the most outliers but also the most extreme ones.

 

2.5 Create Profit Margin variable

# Add profit margin (Profits divided by Revenue) as a new column of Forbes.
Forbes <- Forbes %>%
  mutate(ProfMgn = Profits / Revenue)
# Print the rows whose profit margin is below 5.
# NOTE(review): the filtered result is not assigned, so Forbes itself still
# holds every row in the steps that follow. Confirm that is intended.
Forbes %>%
  filter(ProfMgn < 5)

A tibble: 1,943 x 11

Rank Company Sector Industry Continent Country Revenue Profits Assets

1 1 ICBC Finan… Major B… Asia China 176. 45.2 4034. 2 2 JPMorg… Finan… Major B… North Am… United… 133. 32.7 2737. 3 3 China … Finan… Major B… Asia China 150. 38.8 3382. 4 4 Agricu… Finan… Regiona… Asia China 137. 30.9 3293. 5 5 Bank o… Finan… Major B… North Am… United… 112. 28.5 2377. 6 6 Apple Infor… Compute… North Am… United… 262. 59.4 374. 7 7 Ping A… Finan… Diversi… Asia China 152. 16.3 1038. 8 8 Bank o… Finan… Major B… Asia China 127. 27.5 3098. 9 9 Royal … Energy Oil & G… Europe Nether… 383. 23.3 399. 10 10 Wells … Finan… Major B… North Am… United… 101. 23.1 1888. # … with 1,933 more rows, and 2 more variables: Market_Value , # ProfMgn

# Do not modify the following code:
# Renders the head() of Forbes as a markdown table.
knitr::kable(head(Forbes), format = "markdown")
Rank Company Sector Industry Continent Country Revenue Profits Assets Market_Value ProfMgn
1 ICBC Financials Major Banks Asia China 175.874 45.223 4034.482 305.057 0.2571329
2 JPMorgan Chase Financials Major Banks North America United States 132.912 32.738 2737.188 368.502 0.2463134
3 China Construction Bank Financials Major Banks Asia China 150.313 38.841 3382.422 224.988 0.2584008
4 Agricultural Bank of China Financials Regional Banks Asia China 137.456 30.894 3293.105 197.045 0.2247556
5 Bank of America Financials Major Banks North America United States 111.904 28.540 2377.164 287.339 0.2550400
6 Apple Information Technology Computer Hardware North America United States 261.705 59.431 373.719 961.257 0.2270916

 

2.6 Create boxplot of Profit Margin by Sector

# Boxplot of profit margin (x-axis) for each sector (y-axis).
ggplot(data = Forbes, mapping = aes(x = ProfMgn, y = Sector)) +
  geom_boxplot()
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).

The sector that appears to have the greatest standard deviation is Financials.

 

2.7 Calculate the SD for each sector

# Standard deviation of profit margin within each sector (NA values are
# skipped), listed from the most variable sector to the least.
Forbes.SD <- Forbes %>%
  group_by(Sector) %>%
  summarise(stnd_dev = sd(ProfMgn, na.rm = TRUE)) %>%
  arrange(desc(stnd_dev))
# Do not modify the following code:
knitr::kable(Forbes.SD, format = "markdown")
Sector stnd_dev
Financials 7.5349304
Consumer Discretionary 2.0406190
Utilities 0.4912375
Telecommunication Services 0.4698143
Information Technology 0.2303487
Industrials 0.1611717
Materials 0.1566098
Health Care 0.1488790
Consumer Staples 0.1282274
Energy 0.1089204

The sector that has the greatest standard deviation is Financials, as was estimated, with a standard deviation of 7.53. Based on the previous graph, Financials has one large outlier, which could explain why the standard deviation is so much larger for Financials than for any of the other sectors in this dataset. Consumer Discretionary has the second highest standard deviation, at 2.04. The graph also showed that Consumer Discretionary has an outlier, although it’s not as large as the outlier for Financials. The remaining sectors all have a standard deviation less than 1, which indicates that data points are clustered around the mean of each sector.