# Read the fish catch data from the JSE archive. The nine column names are
# supplied explicitly through `col_names`.
fish <- read_table2(
  file = "http://jse.amstat.org/datasets/fishcatch.dat.txt",
  col_names = c(
    "obs", "species", "weight",
    "len1", "len2", "len3",
    "height.pct", "width.pct",
    "sex"
  )
)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## obs = col_double(),
## species = col_double(),
## weight = col_double(),
## len1 = col_double(),
## len2 = col_double(),
## len3 = col_double(),
## height.pct = col_double(),
## width.pct = col_double(),
## sex = col_double()
## )
# Turn the numeric sex code into a labelled factor (code "0" becomes "male",
# code "1" becomes "female").
# NOTE(review): the JSE data set description appears to code sex as
# 1 = male, 0 = female -- confirm this mapping against the documentation.
fish$sex <- fct_recode(factor(fish$sex), male = "0", female = "1")

# Turn the numeric species code (1-7) into a factor labelled with the
# English species names.
fish$species <- fct_recode(
  factor(fish$species),
  "Common Bream" = "1",
  "Whitefish"    = "2",
  "Roach"        = "3",
  "Silver Bream" = "4",
  "Smelt"        = "5",
  "Pike"         = "6",
  "Perch"        = "7"
)
# Do not modify the following code:
# Keeps the rows of `fish` whose sex is recorded, then renders the first rows
# of that subset as a markdown table.
fish.sub <- filter(fish, !is.na(sex)) # only display rows where sex is not NA
knitr::kable(head(fish.sub), format = "markdown")
| obs | species | weight | len1 | len2 | len3 | height.pct | width.pct | sex |
|---|---|---|---|---|---|---|---|---|
| 14 | Common Bream | NA | 29.5 | 32 | 37.3 | 37.3 | 13.6 | female |
| 15 | Common Bream | 600 | 29.4 | 32 | 37.2 | 40.2 | 13.9 | female |
| 17 | Common Bream | 700 | 30.4 | 33 | 38.3 | 38.8 | 13.8 | female |
| 21 | Common Bream | 575 | 31.3 | 34 | 39.5 | 38.3 | 14.1 | female |
| 26 | Common Bream | 725 | 31.8 | 35 | 40.9 | 40.0 | 14.8 | female |
| 30 | Common Bream | 1000 | 33.5 | 37 | 42.6 | 44.5 | 15.5 | male |
# Mean weight per species, ignoring missing weights, sorted from the lightest
# species to the heaviest.
mean.wt <- group_by(fish, species)
mean.wt <- dplyr::summarize(mean.wt, Mean = mean(weight, na.rm = TRUE))
mean.wt <- arrange(mean.wt, Mean)
# Do not modify the following code:
# Renders the per-species mean weights as a markdown table.
knitr::kable(mean.wt, format = "markdown")
| species | Mean |
|---|---|
| Smelt | 11.17857 |
| Roach | 152.05000 |
| Silver Bream | 154.81818 |
| Perch | 382.23929 |
| Whitefish | 531.00000 |
| Common Bream | 626.00000 |
| Pike | 718.70588 |
The species with the smallest mean weight is Smelt, with a mean weight of 11.18 g.
# Bar chart of the mean weight of each species; bar heights are taken
# directly from the `Mean` column.
ggplot(mean.wt, aes(x = species, y = Mean)) +
  geom_col() +
  labs(title = "Mean Weight per Species", x = "Species", y = "Weight")
# Read the 2019 Forbes Global 2000 data from a local CSV file.
# NOTE(review): the path is absolute and specific to one machine, so this
# line will fail elsewhere -- consider a project-relative path.
Forbes <- read_csv(
  file = "/Users/oliviacochran/Desktop/Grad School/Spring 2021/Strategy Analytics II/R Exercise/2019_Forbes_Global_2000.csv"
)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## Rank = col_double(),
## Company = col_character(),
## Sector = col_character(),
## Industry = col_character(),
## Continent = col_character(),
## Country = col_character(),
## Revenue = col_double(),
## Profits = col_double(),
## Assets = col_double(),
## Market_Value = col_double()
## )
# Keep only companies that have a sector recorded and a non-negative revenue.
Forbes <- filter(Forbes, !is.na(Sector), Revenue >= 0)
# Stand-alone factor holding the ten sector names, printed to show its levels.
# NOTE(review): this creates a separate vector named `Sector`; it does not
# modify the `Sector` column of `Forbes` -- confirm that is intended.
Sector <- factor(c(
  "Consumer Discretionary",
  "Consumer Staples",
  "Energy",
  "Financials",
  "Health Care",
  "Industrials",
  "Information Technology",
  "Materials",
  "Telecommunication Services",
  "Utilities"
))
Sector
## [1] Consumer Discretionary Consumer Staples
## [3] Energy Financials
## [5] Health Care Industrials
## [7] Information Technology Materials
## [9] Telecommunication Services Utilities
## 10 Levels: Consumer Discretionary Consumer Staples Energy ... Utilities
# Stand-alone factor holding the eighty industry names, printed to show its
# levels.
# NOTE(review): this creates a separate vector named `Industry`; it does not
# modify the `Industry` column of `Forbes` -- confirm that is intended.
Industry <- factor(c(
  "Advertising", "Aerospace & Defense", "Air Courier", "Airline",
  "Aluminum", "Apparel/Accessories", "Apparel/Footwear Retail",
  "Auto & Truck Manufacturers", "Auto & Truck Parts", "Beverages",
  "Biotechs", "Broadcasting & Cable", "Business & Personal Service",
  "Business Products & Supplies", "Casinos & Gaming",
  "Communications Equipment", "Computer & Electronics Retail",
  "Computer Hardware", "Computer Service", "Computer Storage Devices",
  "Conglomerates", "Construction Materials", "Construction Services",
  "Consumer Electronics", "Consumer Financial Services",
  "Containers & Packaging", "Department Stores", "Discount Stores",
  "Diversified Chemicals", "Diversified Insurance",
  "Diversified Metals & Mining", "Diversified Utilities", "Drug Retail",
  "Electric Utilities", "Electrical Equipment", "Electronics",
  "Environmental & Waste", "Food Processing", "Food Retail",
  "Forest Products", "Furniture & Fixtures", "Healthcare Services",
  "Heavy Equipment", "Home Improvement & Retail", "Hotels & Motels",
  "Household Appliances", "Household/Personal Care", "Insurance Brokers",
  "Internet & Catalog Retail", "Investment Services", "Iron & Steel",
  "Life & Health Insurance", "Major Banks", "Managed Health Care",
  "Medical Equipment & Supplies", "Natural Gas Utilities",
  "Oil & Gas Operations", "Oil Services & Equipment",
  "Other Industrial Equipment", "Other Transportation",
  "Paper & Paper Products", "Pharmaceuticals", "Printing & Publishing",
  "Property & Casualty Insurance", "Railroads", "Real Estate",
  "Recreational Products", "Regional Banks", "Rental & Leasing",
  "Restaurants", "Security Systems", "Semiconductors",
  "Software & Programming", "Specialized Chemicals", "Specialty Stores",
  "Telecommunication services", "Thrifts & Mortgage Finance", "Tobacco",
  "Trading Companies", "Trucking"
))
Industry
## [1] Advertising Aerospace & Defense
## [3] Air Courier Airline
## [5] Aluminum Apparel/Accessories
## [7] Apparel/Footwear Retail Auto & Truck Manufacturers
## [9] Auto & Truck Parts Beverages
## [11] Biotechs Broadcasting & Cable
## [13] Business & Personal Service Business Products & Supplies
## [15] Casinos & Gaming Communications Equipment
## [17] Computer & Electronics Retail Computer Hardware
## [19] Computer Service Computer Storage Devices
## [21] Conglomerates Construction Materials
## [23] Construction Services Consumer Electronics
## [25] Consumer Financial Services Containers & Packaging
## [27] Department Stores Discount Stores
## [29] Diversified Chemicals Diversified Insurance
## [31] Diversified Metals & Mining Diversified Utilities
## [33] Drug Retail Electric Utilities
## [35] Electrical Equipment Electronics
## [37] Environmental & Waste Food Processing
## [39] Food Retail Forest Products
## [41] Furniture & Fixtures Healthcare Services
## [43] Heavy Equipment Home Improvement & Retail
## [45] Hotels & Motels Household Appliances
## [47] Household/Personal Care Insurance Brokers
## [49] Internet & Catalog Retail Investment Services
## [51] Iron & Steel Life & Health Insurance
## [53] Major Banks Managed Health Care
## [55] Medical Equipment & Supplies Natural Gas Utilities
## [57] Oil & Gas Operations Oil Services & Equipment
## [59] Other Industrial Equipment Other Transportation
## [61] Paper & Paper Products Pharmaceuticals
## [63] Printing & Publishing Property & Casualty Insurance
## [65] Railroads Real Estate
## [67] Recreational Products Regional Banks
## [69] Rental & Leasing Restaurants
## [71] Security Systems Semiconductors
## [73] Software & Programming Specialized Chemicals
## [75] Specialty Stores Telecommunication services
## [77] Thrifts & Mortgage Finance Tobacco
## [79] Trading Companies Trucking
## 80 Levels: Advertising Aerospace & Defense Air Courier Airline ... Trucking
# Stand-alone factor holding the six continent names, printed to show its
# levels.
# NOTE(review): this creates a separate vector named `Continent`; it does not
# modify the `Continent` column of `Forbes` -- confirm that is intended.
Continent <- factor(c(
  "Africa",
  "Asia",
  "Australia",
  "Europe",
  "North America",
  "South America"
))
Continent
## [1] Africa Asia Australia Europe North America
## [6] South America
## Levels: Africa Asia Australia Europe North America South America
# Stand-alone factor holding the sixty country names, printed to show its
# levels.
# NOTE(review): this creates a separate vector named `Country`; it does not
# modify the `Country` column of `Forbes` -- confirm that is intended.
Country <- factor(c(
  "Argentina", "Australia", "Austria", "Bahrain", "Belgium", "Bermuda",
  "Brazil", "Canada", "Chile", "China", "Colombia", "Czech Republic",
  "Denmark", "Egypt", "Finland", "France", "Germany", "Greece",
  "Hong Kong", "Hungary", "India", "Indonesia", "Ireland", "Israel",
  "Italy", "Japan", "Jordan", "Kazakhstan", "Kenya", "Kuwait",
  "Lebanon", "Luxembourg", "Malaysia", "Mexico", "Monaco", "Morocco",
  "Netherlands", "Nigeria", "Norway", "Oman", "Peru", "Philippines",
  "Poland", "Portugal", "Qatar", "Russia", "Saudi Arabia", "Singapore",
  "South Africa", "South Korea", "Spain", "Sweden", "Switzerland", "Taiwan",
  "Thailand", "Turkey", "United Arab Emirates", "United Kingdom",
  "United States", "Vietnam"
))
Country
## [1] Argentina Australia Austria
## [4] Bahrain Belgium Bermuda
## [7] Brazil Canada Chile
## [10] China Colombia Czech Republic
## [13] Denmark Egypt Finland
## [16] France Germany Greece
## [19] Hong Kong Hungary India
## [22] Indonesia Ireland Israel
## [25] Italy Japan Jordan
## [28] Kazakhstan Kenya Kuwait
## [31] Lebanon Luxembourg Malaysia
## [34] Mexico Monaco Morocco
## [37] Netherlands Nigeria Norway
## [40] Oman Peru Philippines
## [43] Poland Portugal Qatar
## [46] Russia Saudi Arabia Singapore
## [49] South Africa South Korea Spain
## [52] Sweden Switzerland Taiwan
## [55] Thailand Turkey United Arab Emirates
## [58] United Kingdom United States Vietnam
## 60 Levels: Argentina Australia Austria Bahrain Belgium Bermuda ... Vietnam
# Keep the companies located in Asia, Europe, or North America, then plot
# market value against revenue with a smoothed trend line, using one panel
# per continent.
MVS <- Forbes[Forbes$Continent %in% c("Asia", "Europe", "North America"), ]
ggplot(MVS, aes(x = Revenue, y = Market_Value)) +
  geom_point() +
  geom_smooth() +
  facet_grid(. ~ Continent)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Based on the graphs, Asia, Europe, and North America all have a positive relationship between revenue and market value until revenue reaches about 250. At that point, Asia begins to have a negative relationship between the two variables (as revenue increases, market value decreases), while Europe stays positive. For North America, the relationship between revenue and market value stays positive until revenue reaches about 350 and then sharply declines. North America has the steepest slope out of the three, which is impacted by the outliers in the dataset. Compared to Asia and Europe, North America has not only the most outliers, but also the most extreme.
# Add the profit margin (profits divided by revenue) as a new column.
Forbes <- Forbes %>%
  mutate(ProfMgn = Profits / Revenue)

# Display the companies whose profit margin is below 5.
# NOTE(review): the filtered result is only printed, never assigned, so
# `Forbes` still contains every row afterwards -- confirm whether the filter
# was meant to be stored back into `Forbes`.
Forbes %>%
  filter(ProfMgn < 5)
Rank Company Sector Industry Continent Country Revenue Profits Assets
# Do not modify the following code:
# Renders the first rows of `Forbes` as a markdown table.
knitr::kable(head(Forbes), format = "markdown")
| Rank | Company | Sector | Industry | Continent | Country | Revenue | Profits | Assets | Market_Value | ProfMgn |
|---|---|---|---|---|---|---|---|---|---|---|
| 1 | ICBC | Financials | Major Banks | Asia | China | 175.874 | 45.223 | 4034.482 | 305.057 | 0.2571329 |
| 2 | JPMorgan Chase | Financials | Major Banks | North America | United States | 132.912 | 32.738 | 2737.188 | 368.502 | 0.2463134 |
| 3 | China Construction Bank | Financials | Major Banks | Asia | China | 150.313 | 38.841 | 3382.422 | 224.988 | 0.2584008 |
| 4 | Agricultural Bank of China | Financials | Regional Banks | Asia | China | 137.456 | 30.894 | 3293.105 | 197.045 | 0.2247556 |
| 5 | Bank of America | Financials | Major Banks | North America | United States | 111.904 | 28.540 | 2377.164 | 287.339 | 0.2550400 |
| 6 | Apple | Information Technology | Computer Hardware | North America | United States | 261.705 | 59.431 | 373.719 | 961.257 | 0.2270916 |
# Horizontal box plots of profit margin, one box per sector.
ggplot(Forbes, aes(x = ProfMgn, y = Sector)) +
  geom_boxplot()
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
The sector that appears to have the greatest standard deviation is Financials.
# Standard deviation of profit margin within each sector (missing margins
# ignored), sorted from the most variable sector to the least.
Forbes.SD <- Forbes %>%
  group_by(Sector) %>%
  summarise(stnd_dev = sd(ProfMgn, na.rm = TRUE)) %>%
  arrange(desc(stnd_dev))
# Do not modify the following code:
# Renders the per-sector standard deviations as a markdown table.
knitr::kable(Forbes.SD, format = "markdown")
| Sector | stnd_dev |
|---|---|
| Financials | 7.5349304 |
| Consumer Discretionary | 2.0406190 |
| Utilities | 0.4912375 |
| Telecommunication Services | 0.4698143 |
| Information Technology | 0.2303487 |
| Industrials | 0.1611717 |
| Materials | 0.1566098 |
| Health Care | 0.1488790 |
| Consumer Staples | 0.1282274 |
| Energy | 0.1089204 |
The sector that has the greatest standard deviation is Financials, as was estimated, with a standard deviation of 7.53. Based on the previous graph, Financials has one large outlier, which could explain why the standard deviation is so much larger for Financials than for any of the other sectors in this dataset. Consumer Discretionary has the second highest standard deviation, at 2.04. The graph also showed that Consumer Discretionary has an outlier, although it’s not as large as the outlier for Financials. The remaining sectors all have a standard deviation less than 1, which indicates that data points are clustered around the mean of each sector.