Part 1: Analysis of Fish Data (35 total points)

1.1 Import the data

# Read the whitespace-delimited fish catch data directly from the JSE archive.
# col_names supplies the nine column names, so the first line of the file is
# read as data rather than as a header.
fish <- read_table2(
  "http://jse.amstat.org/datasets/fishcatch.dat.txt",
  col_names = c(
    "obs", "species", "weight",
    "len1", "len2", "len3",
    "height.pct", "width.pct", "sex"
  )
)

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   obs = col_double(),
##   species = col_double(),
##   weight = col_double(),
##   len1 = col_double(),
##   len2 = col_double(),
##   len3 = col_double(),
##   height.pct = col_double(),
##   width.pct = col_double(),
##   sex = col_double()
## )

1.2 Change ‘sex’ to be a factor and rename

# Convert the numeric sex code to a factor and relabel its two levels in one
# step: code 0 becomes "male" and code 1 becomes "female"; NA stays NA.
# NOTE(review): the JSE fishcatch documentation appears to define the code as
# 1 = male, 0 = female -- confirm that this mapping is not inverted.
fish$sex <- fct_recode(
  factor(fish$sex),
  "male" = "0",
  "female" = "1"
)

1.3 Change ‘species’ to be a factor and rename

# Convert the numeric species code (1-7) to a factor and replace each code
# with the species name; the level order follows the numeric codes.
fish$species <- fct_recode(
  factor(fish$species),
  "Common Bream" = "1",
  "Whitefish" = "2",
  "Roach" = "3",
  "Silver Bream" = "4",
  "Smelt" = "5",
  "Pike" = "6",
  "Perch" = "7"
)
# Do not modify the following code:
fish.sub <- filter(fish, !is.na(sex)) # only display rows where sex is not NA
knitr::kable(head(fish.sub), format = "markdown")

obs	species	weight	len1	len2	len3	height.pct	width.pct	sex
14	Common Bream	NA	29.5	32	37.3	37.3	13.6	female
15	Common Bream	600	29.4	32	37.2	40.2	13.9	female
17	Common Bream	700	30.4	33	38.3	38.8	13.8	female
21	Common Bream	575	31.3	34	39.5	38.3	14.1	female
26	Common Bream	725	31.8	35	40.9	40.0	14.8	female
30	Common Bream	1000	33.5	37	42.6	44.5	15.5	male

1.4 Determine mean weight for each species

# Mean weight per species, skipping missing weights, sorted from the lightest
# species to the heaviest.
mean.wt <- fish %>%
  group_by(species) %>%
  dplyr::summarize(Mean = mean(weight, na.rm = TRUE)) %>%
  arrange(Mean)
# Do not modify the following code:
knitr::kable(mean.wt, format = "markdown")

species	Mean
Smelt	11.17857
Roach	152.05000
Silver Bream	154.81818
Perch	382.23929
Whitefish	531.00000
Common Bream	626.00000
Pike	718.70588

The species with the smallest mean weight is Smelt, with a mean weight of 11.18 grams (the dataset records weight in grams).

1.5 Plot the mean weights for each species

# Bar chart of the pre-computed species means: geom_col() draws each bar at
# the value already stored in Mean instead of counting rows.
ggplot(data = mean.wt, mapping = aes(x = species, y = Mean)) +
  geom_col() +
  labs(title = "Mean Weight per Species", x = "Species", y = "Weight")

Part 2: Analysis of Forbes Global 2000 Data (50 total points)

2.1 Import the dataset

# Load the Forbes Global 2000 data. A copy of the CSV in the working directory
# is preferred so the report can be knitted on any machine; when no local copy
# exists, fall back to the original absolute location. local() keeps the
# helper variables out of the global environment.
Forbes <- local({
  csv_name <- "2019_Forbes_Global_2000.csv"
  fallback <- file.path(
    "/Users/oliviacochran/Desktop/Grad School/Spring 2021",
    "Strategy Analytics II/R Exercise",
    csv_name
  )
  read_csv(if (file.exists(csv_name)) csv_name else fallback)
})

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Rank = col_double(),
##   Company = col_character(),
##   Sector = col_character(),
##   Industry = col_character(),
##   Continent = col_character(),
##   Country = col_character(),
##   Revenue = col_double(),
##   Profits = col_double(),
##   Assets = col_double(),
##   Market_Value = col_double()
## )

2.2 Exclude specified records

# Keep only companies that have a sector recorded and a non-negative revenue.
# Conditions passed to one filter() call are combined with AND, and rows where
# a condition evaluates to NA are dropped.
Forbes <- filter(Forbes, !is.na(Sector), Revenue >= 0)

2.3 Convert four variables to factors

# Convert the Sector column of Forbes to a factor in place. factor() derives
# the levels from the values present in the column, in sorted order, so no
# hand-typed level list is needed. Print the levels to check the result.
Forbes$Sector <- factor(Forbes$Sector)
levels(Forbes$Sector)

##  [1] Consumer Discretionary     Consumer Staples          
##  [3] Energy                     Financials                
##  [5] Health Care                Industrials               
##  [7] Information Technology     Materials                 
##  [9] Telecommunication Services Utilities                 
## 10 Levels: Consumer Discretionary Consumer Staples Energy ... Utilities

# Convert the Industry column of Forbes to a factor in place. factor() derives
# the levels from the values present in the column, in sorted order, which
# avoids spelling or capitalisation mismatches from a hand-typed level list.
Forbes$Industry <- factor(Forbes$Industry)
levels(Forbes$Industry)

##  [1] Advertising                   Aerospace & Defense          
##  [3] Air Courier                   Airline                      
##  [5] Aluminum                      Apparel/Accessories          
##  [7] Apparel/Footwear Retail       Auto & Truck Manufacturers   
##  [9] Auto & Truck Parts            Beverages                    
## [11] Biotechs                      Broadcasting & Cable         
## [13] Business & Personal Service   Business Products & Supplies 
## [15] Casinos & Gaming              Communications Equipment     
## [17] Computer & Electronics Retail Computer Hardware            
## [19] Computer Service              Computer Storage Devices     
## [21] Conglomerates                 Construction Materials       
## [23] Construction Services         Consumer Electronics         
## [25] Consumer Financial Services   Containers & Packaging       
## [27] Department Stores             Discount Stores              
## [29] Diversified Chemicals         Diversified Insurance        
## [31] Diversified Metals & Mining   Diversified Utilities        
## [33] Drug Retail                   Electric Utilities           
## [35] Electrical Equipment          Electronics                  
## [37] Environmental & Waste         Food Processing              
## [39] Food Retail                   Forest Products              
## [41] Furniture & Fixtures          Healthcare Services          
## [43] Heavy Equipment               Home Improvement & Retail    
## [45] Hotels & Motels               Household Appliances         
## [47] Household/Personal Care       Insurance Brokers            
## [49] Internet & Catalog Retail     Investment Services          
## [51] Iron & Steel                  Life & Health Insurance      
## [53] Major Banks                   Managed Health Care          
## [55] Medical Equipment & Supplies  Natural Gas Utilities        
## [57] Oil & Gas Operations          Oil Services & Equipment     
## [59] Other Industrial Equipment    Other Transportation         
## [61] Paper & Paper Products        Pharmaceuticals              
## [63] Printing & Publishing         Property & Casualty Insurance
## [65] Railroads                     Real Estate                  
## [67] Recreational Products         Regional Banks               
## [69] Rental & Leasing              Restaurants                  
## [71] Security Systems              Semiconductors               
## [73] Software & Programming        Specialized Chemicals        
## [75] Specialty Stores              Telecommunication services   
## [77] Thrifts & Mortgage Finance    Tobacco                      
## [79] Trading Companies             Trucking                     
## 80 Levels: Advertising Aerospace & Defense Air Courier Airline ... Trucking

# Convert the Continent column of Forbes to a factor in place. factor() derives
# the levels from the values present in the column, in sorted order. Print the
# levels to check the result.
Forbes$Continent <- factor(Forbes$Continent)
levels(Forbes$Continent)

## [1] Africa        Asia          Australia     Europe        North America
## [6] South America
## Levels: Africa Asia Australia Europe North America South America

# Convert the Country column of Forbes to a factor in place. factor() derives
# the levels from the values present in the column, in sorted order. Print the
# levels to check the result.
Forbes$Country <- factor(Forbes$Country)
levels(Forbes$Country)

##  [1] Argentina            Australia            Austria             
##  [4] Bahrain              Belgium              Bermuda             
##  [7] Brazil               Canada               Chile               
## [10] China                Colombia             Czech Republic      
## [13] Denmark              Egypt                Finland             
## [16] France               Germany              Greece              
## [19] Hong Kong            Hungary              India               
## [22] Indonesia            Ireland              Israel              
## [25] Italy                Japan                Jordan              
## [28] Kazakhstan           Kenya                Kuwait              
## [31] Lebanon              Luxembourg           Malaysia            
## [34] Mexico               Monaco               Morocco             
## [37] Netherlands          Nigeria              Norway              
## [40] Oman                 Peru                 Philippines         
## [43] Poland               Portugal             Qatar               
## [46] Russia               Saudi Arabia         Singapore           
## [49] South Africa         South Korea          Spain               
## [52] Sweden               Switzerland          Taiwan              
## [55] Thailand             Turkey               United Arab Emirates
## [58] United Kingdom       United States        Vietnam             
## 60 Levels: Argentina Australia Austria Bahrain Belgium Bermuda ... Vietnam

2.4 Create a scatterplot of Market Value by Sales

# Restrict the data to the three continents being compared, then plot market
# value against revenue with a smoothed trend line, one panel per continent.
MVS <- Forbes %>%
  filter(Continent %in% c("Asia", "Europe", "North America"))

ggplot(data = MVS, mapping = aes(x = Revenue, y = Market_Value)) +
  geom_point() +
  geom_smooth() +
  facet_grid(. ~ Continent)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Based on the graphs, Asia, Europe, and North America all show a positive relationship between revenue and market value until revenue reaches about 250. Beyond that point, the relationship turns negative for Asia (as revenue increases, market value decreases), while it stays positive for Europe. For North America, the relationship stays positive until revenue reaches about 350 and then declines sharply. North America has the steepest slope of the three, which is influenced by the outliers in the dataset. Compared to Asia and Europe, North America has not only the most outliers but also the most extreme ones.

2.5 Create Profit Margin variable

# Add profit margin (profits as a share of revenue) as a new column.
Forbes <- Forbes %>%
  mutate(ProfMgn = Profits / Revenue)

# Display the companies whose profit margin is below 5.
# NOTE(review): this result is only printed, not assigned, so Forbes still
# contains every row -- confirm whether the filtered data was meant to be kept.
Forbes %>%
  filter(ProfMgn < 5)

## # A tibble: 1,943 x 11
##     Rank Company Sector Industry Continent Country Revenue Profits Assets
##    <dbl> <chr>   <chr>  <chr>    <chr>     <chr>     <dbl>   <dbl>  <dbl>
##  1     1 ICBC    Finan… Major B… Asia      China      176.    45.2  4034.
##  2     2 JPMorg… Finan… Major B… North Am… United…    133.    32.7  2737.
##  3     3 China … Finan… Major B… Asia      China      150.    38.8  3382.
##  4     4 Agricu… Finan… Regiona… Asia      China      137.    30.9  3293.
##  5     5 Bank o… Finan… Major B… North Am… United…    112.    28.5  2377.
##  6     6 Apple   Infor… Compute… North Am… United…    262.    59.4   374.
##  7     7 Ping A… Finan… Diversi… Asia      China      152.    16.3  1038.
##  8     8 Bank o… Finan… Major B… Asia      China      127.    27.5  3098.
##  9     9 Royal … Energy Oil & G… Europe    Nether…    383.    23.3   399.
## 10    10 Wells … Finan… Major B… North Am… United…    101.    23.1  1888.
## # … with 1,933 more rows, and 2 more variables: Market_Value <dbl>,
## #   ProfMgn <dbl>

# Do not modify the following code:
knitr::kable(head(Forbes), format = "markdown")

Rank	Company	Sector	Industry	Continent	Country	Revenue	Profits	Assets	Market_Value	ProfMgn
1	ICBC	Financials	Major Banks	Asia	China	175.874	45.223	4034.482	305.057	0.2571329
2	JPMorgan Chase	Financials	Major Banks	North America	United States	132.912	32.738	2737.188	368.502	0.2463134
3	China Construction Bank	Financials	Major Banks	Asia	China	150.313	38.841	3382.422	224.988	0.2584008
4	Agricultural Bank of China	Financials	Regional Banks	Asia	China	137.456	30.894	3293.105	197.045	0.2247556
5	Bank of America	Financials	Major Banks	North America	United States	111.904	28.540	2377.164	287.339	0.2550400
6	Apple	Information Technology	Computer Hardware	North America	United States	261.705	59.431	373.719	961.257	0.2270916

2.6 Create boxplot of Profit Margin by Sector

# Horizontal boxplots: profit margin on the x-axis, one box per sector.
ggplot(data = Forbes, mapping = aes(x = ProfMgn, y = Sector)) +
  geom_boxplot()

## Warning: Removed 1 rows containing non-finite values (stat_boxplot).

The sector that appears to have the greatest standard deviation is Financials.

2.7 Calculate the SD for each sector

# Standard deviation of profit margin within each sector (missing margins are
# ignored), listed from the most variable sector to the least.
Forbes.SD <- Forbes %>%
  group_by(Sector) %>%
  summarise(stnd_dev = sd(ProfMgn, na.rm = TRUE)) %>%
  arrange(desc(stnd_dev))
# Do not modify the following code:
knitr::kable(Forbes.SD, format = "markdown")

Sector	stnd_dev
Financials	7.5349304
Consumer Discretionary	2.0406190
Utilities	0.4912375
Telecommunication Services	0.4698143
Information Technology	0.2303487
Industrials	0.1611717
Materials	0.1566098
Health Care	0.1488790
Consumer Staples	0.1282274
Energy	0.1089204

The sector that has the greatest standard deviation is Financials, as was estimated, with a standard deviation of 7.53. Based on the previous graph, Financials has one large outlier, which could explain why the standard deviation is so much larger for Financials than for any of the other sectors in this dataset. Consumer Discretionary has the second-highest standard deviation, at 2.04. The graph also showed that Consumer Discretionary has an outlier, although it's not as large as the outlier for Financials. The remaining sectors all have a standard deviation less than 1, which indicates that the data points are clustered around the mean of each sector.

R Exercise

Olivia Cochran

Spring 2021